In [None]:
import nannyml as nml
import pandas as pd
from IPython.display import display

In [None]:
from docs.utils import print_some_of_the_columns_only_markdown, print_table

In [None]:
# Load the bundled US Census (MA employment) example data. The loader returns
# three objects; only the reference and analysis frames are needed here, so the
# third one is discarded.
reference_df, analysis_df, _ = nml.load_us_census_ma_employment_data()

# Preview the first rows of both frames.
display(reference_df.head(), analysis_df.head())

Unnamed: 0,id,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,...,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,employed,year,prediction,predicted_probability
0,0,62.0,16,1,17,1,0,1,1,4,...,1,2,2,2,2,1,0,2015,0,0.121211
1,1,48.0,21,1,0,2,0,1,1,4,...,1,2,2,2,2,1,0,2015,1,0.816033
2,2,47.0,21,1,1,2,0,1,1,4,...,1,2,2,2,1,1,0,2015,1,0.951815
3,3,34.0,12,5,0,2,0,1,3,4,...,1,2,2,2,1,2,0,2015,1,0.563825
4,4,33.0,23,5,0,2,0,5,1,4,...,2,2,2,2,1,1,1,2015,1,0.944436


Unnamed: 0,id,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,year,prediction,predicted_probability
0,68785,46.0,21,1,0,2,0,3,1,4,2,1,2,2,2,1,1,2016,1,0.948828
1,68786,46.0,21,1,1,2,0,1,1,4,1,1,2,2,2,2,1,2016,1,0.772002
2,68787,12.0,9,5,2,2,1,1,1,0,2,1,2,2,2,2,1,2016,0,0.000149
3,68788,52.0,21,3,0,2,0,1,1,4,2,1,2,2,2,2,1,2016,1,0.90607
4,68789,21.0,18,5,2,2,0,1,1,4,2,1,2,2,2,1,1,2016,1,0.699663


In [None]:
# Print a trimmed preview of the reference data using the docs helper.
# NOTE(review): judging by the rendered table below, the arguments 2 and 5 keep
# the first 2 and the last 5 columns -- confirm against docs.utils.
print_some_of_the_columns_only_markdown(reference_df, 2, 5)

+----+------+--------+-------+---------+------------+--------+--------------+-------------------------+
|    | id   | AGEP   | ...   | RAC1P   | employed   | year   | prediction   | predicted_probability   |
| 0  | 0    | 62     | ...   | 1       | 0          | 2015   | 0            | 0.121211                |
+----+------+--------+-------+---------+------------+--------+--------------+-------------------------+
| 1  | 1    | 48     | ...   | 1       | 0          | 2015   | 1            | 0.816033                |
+----+------+--------+-------+---------+------------+--------+--------------+-------------------------+
| 2  | 2    | 47     | ...   | 1       | 0          | 2015   | 1            | 0.951815                |
+----+------+--------+-------+---------+------------+--------+--------------+-------------------------+
| 3  | 3    | 34     | ...   | 2       | 0          | 2015   | 1            | 0.563825                |
+----+------+--------+-------+---------+------------+--------+--

In [None]:
# Print a trimmed preview of the analysis data using the docs helper.
# NOTE(review): judging by the rendered table below, the arguments 2 and 5 keep
# the first 2 and the last 5 columns -- confirm against docs.utils.
print_some_of_the_columns_only_markdown(analysis_df, 2, 5)

+----+-------+--------+-------+-------+---------+--------+--------------+-------------------------+
|    | id    | AGEP   | ...   | SEX   | RAC1P   | year   | prediction   | predicted_probability   |
| 0  | 68785 | 46     | ...   | 1     | 1       | 2016   | 1            | 0.948828                |
+----+-------+--------+-------+-------+---------+--------+--------------+-------------------------+
| 1  | 68786 | 46     | ...   | 2     | 1       | 2016   | 1            | 0.772002                |
+----+-------+--------+-------+-------+---------+--------+--------------+-------------------------+
| 2  | 68787 | 12     | ...   | 2     | 1       | 2016   | 0            | 0.000149194             |
+----+-------+--------+-------+-------+---------+--------+--------------+-------------------------+
| 3  | 68788 | 52     | ...   | 2     | 1       | 2016   | 1            | 0.90607                 |
+----+-------+--------+-------+-------+---------+--------+--------------+-------------------------+


In [None]:
# Shared chunk size, passed as `chunk_size=` to the CBPE estimator, the
# univariate drift calculator and the performance calculator in later cells.
chunk_size = 5_000

In [None]:
# Configure a CBPE estimator for a binary classification problem, tracking the
# ROC AUC metric only. The string arguments name columns of the data frames
# loaded earlier: the target, the predicted label and the predicted probability.
estimator = nml.CBPE(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='employed',
    y_pred='prediction',
    y_pred_proba='predicted_probability',
    chunk_size=chunk_size,
)

In [None]:
# Fit the estimator on the reference data (which includes the `employed`
# target), then estimate performance on the analysis data, which has no
# `employed` column.
estimator = estimator.fit(reference_df)
estimated_performance = estimator.estimate(analysis_df)



In [None]:
# Plot the estimated performance results and render the figure in the notebook.
# `figure` is reused by the export cell that follows.
figure = estimated_performance.plot()
figure.show()

In [None]:
# Export the figure from the previous cell as an SVG. The path is relative, so
# the output location depends on the working directory the notebook runs in.
figure.write_image('../_static/quickstart/quick-start-perf-est.svg', width=1000)

In [None]:
# Feature columns to monitor for drift. This leaves out `id`, `year`, the
# target and the prediction columns.
feature_column_names = [
    'AGEP', 'SCHL', 'MAR', 'RELP',
    'DIS', 'ESP', 'CIT', 'MIG',
    'MIL', 'ANC', 'NATIVITY', 'DEAR',
    'DEYE', 'DREM', 'SEX', 'RAC1P',
]

# Univariate drift calculator over those columns, using the shared chunk size.
univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=feature_column_names,
    chunk_size=chunk_size,
)

# Fit on the reference data, then compute drift results for the analysis data.
univariate_calculator.fit(reference_df)
univariate_drift = univariate_calculator.calculate(analysis_df)

In [None]:
# Rank the monitored columns from the univariate drift results. The resulting
# table (see output) has `number_of_alerts`, `column_name` and `rank` columns.
alert_count_ranker = nml.AlertCountRanker()
alert_count_ranked_features = alert_count_ranker.rank(univariate_drift)
# Show only the first rows of the ranking.
display(alert_count_ranked_features.head())

Unnamed: 0,number_of_alerts,column_name,rank
0,36,ANC,1
1,29,AGEP,2
2,28,DREM,3
3,27,RELP,4
4,26,SCHL,5


In [None]:
# Print the same top-of-ranking rows as a text table via the docs helper.
print_table(alert_count_ranked_features.head())

+----+--------------------+---------------+--------+
|    | number_of_alerts   | column_name   | rank   |
| 0  | 36                 | ANC           | 1      |
+----+--------------------+---------------+--------+
| 1  | 29                 | AGEP          | 2      |
+----+--------------------+---------------+--------+
| 2  | 28                 | DREM          | 3      |
+----+--------------------+---------------+--------+
| 3  | 27                 | RELP          | 4      |
+----+--------------------+---------------+--------+
| 4  | 26                 | SCHL          | 5      |
+----+--------------------+---------------+--------+


In [None]:
# Plot the drift results for three columns that appear in the top five of the
# alert-count ranking. `figure` is reused by the export cell that follows.
figure = univariate_drift.filter(
    column_names=['RELP', 'AGEP', 'SCHL'],
).plot()
figure.show()

In [None]:
# Export the drift figure from the previous cell as an SVG. The path is
# relative, so the output location depends on the working directory.
# The path is a plain string literal: it contains no placeholders, so an
# f-string prefix is unnecessary.
figure.write_image('../_static/quickstart/quick-start-drift.svg', width=1000)

In [None]:
# Narrow the drift results to the AGEP column and the analysis period only.
uni_drift_AGEP_analysis = univariate_drift.filter(column_names=['AGEP'], period='analysis')
# Compare the estimated performance with the AGEP drift results and plot the
# comparison. `figure` is reused by the export cell that follows.
figure = estimated_performance.compare(uni_drift_AGEP_analysis).plot()
figure.show()

In [None]:
# Export the drift-versus-performance figure from the previous cell as an SVG.
# The path is relative, so the output location depends on the working directory.
# The path is a plain string literal: it contains no placeholders, so an
# f-string prefix is unnecessary.
figure.write_image('../_static/quickstart/quick-start-drift-n-performance.svg', width=1000)

In [None]:
# Plot the drift results for the same three columns, restricted to the
# analysis period and rendered with kind='distribution'.
# `figure` is reused by the export cell that follows.
figure = univariate_drift.filter(
    period='analysis',
    column_names=['RELP', 'AGEP', 'SCHL'],
).plot(kind='distribution')
figure.show()

In [None]:
# Export the distribution figure from the previous cell as an SVG. The path is
# relative, so the output location depends on the working directory.
# The path is a plain string literal: it contains no placeholders, so an
# f-string prefix is unnecessary.
figure.write_image('../_static/quickstart/quick-start-univariate-distribution.svg', width=1000)

In [None]:
# Call the dataset loader again, this time keeping only its third return value
# (the analysis targets); the first two values are discarded.
_, _, analysis_targets_df = nml.load_us_census_ma_employment_data()

In [None]:
# Attach the realized targets to the analysis data.
# The targets frame carries its own `id` column, so join on that key instead of
# concatenating the frames side by side. This avoids a duplicated `id` column in
# the result and does not rely on both frames sharing the same row order/index.
# A left join keeps every analysis row.
analysis_with_targets_df = analysis_df.merge(analysis_targets_df, on='id', how='left')
display(analysis_with_targets_df.head())

Unnamed: 0,id,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,...,DEAR,DEYE,DREM,SEX,RAC1P,year,prediction,predicted_probability,id.1,employed
0,68785,46.0,21,1,0,2,0,3,1,4,...,2,2,2,1,1,2016,1,0.948828,68785,1
1,68786,46.0,21,1,1,2,0,1,1,4,...,2,2,2,2,1,2016,1,0.772002,68786,1
2,68787,12.0,9,5,2,2,1,1,1,0,...,2,2,2,2,1,2016,0,0.000149,68787,0
3,68788,52.0,21,3,0,2,0,1,1,4,...,2,2,2,2,1,2016,1,0.90607,68788,1
4,68789,21.0,18,5,2,2,0,1,1,4,...,2,2,2,1,1,2016,1,0.699663,68789,0


In [None]:
# Print a trimmed preview of the first rows of the analysis data with targets.
# NOTE(review): judging by the rendered table below, the arguments 2 and 5 keep
# the first 2 and the last 5 columns -- confirm against docs.utils.
print_some_of_the_columns_only_markdown(analysis_with_targets_df.head(), 2, 5)

+----+-------+--------+-------+--------+--------------+-------------------------+-------+------------+
|    | id    | AGEP   | ...   | year   | prediction   | predicted_probability   | id    | employed   |
| 0  | 68785 | 46     | ...   | 2016   | 1            | 0.948828                | 68785 | 1          |
+----+-------+--------+-------+--------+--------------+-------------------------+-------+------------+
| 1  | 68786 | 46     | ...   | 2016   | 1            | 0.772002                | 68786 | 1          |
+----+-------+--------+-------+--------+--------------+-------------------------+-------+------------+
| 2  | 68787 | 12     | ...   | 2016   | 0            | 0.000149194             | 68787 | 0          |
+----+-------+--------+-------+--------+--------------+-------------------------+-------+------------+
| 3  | 68788 | 52     | ...   | 2016   | 1            | 0.90607                 | 68788 | 1          |
+----+-------+--------+-------+--------+--------------+------------------

In [None]:
# Performance calculator configured with the same problem type, columns, metric
# and chunk size as the CBPE estimator above.
performance_calculator = nml.PerformanceCalculator(
    problem_type='classification_binary',
    metrics=['roc_auc'],
    y_true='employed',
    y_pred='prediction',
    y_pred_proba='predicted_probability',
    chunk_size=chunk_size,
)

# Fit on the reference data, then calculate performance on the analysis data
# that now includes the `employed` target.
performance_calculator.fit(reference_df)
calculated_performance = performance_calculator.calculate(analysis_with_targets_df)

# Compare the analysis-period estimates with the calculated performance and
# plot the comparison. `figure` is reused by the export cell that follows.
figure = (
    estimated_performance
    .filter(period='analysis')
    .compare(calculated_performance)
    .plot()
)
figure.show()

In [None]:
# Export the estimated-versus-calculated performance figure from the previous
# cell as an SVG. The path is relative, so the output location depends on the
# working directory.
# The path is a plain string literal: it contains no placeholders, so an
# f-string prefix is unnecessary.
figure.write_image('../_static/quickstart/quick-start-estimated-and-realized.svg', width=1000)