# Example on a real-world dataset

In [25]:
def showrst(df):
    """Print ``df`` as a grid-formatted text table.

    The table text is produced by ``df.to_markdown(tablefmt="grid")`` and
    written to standard output. Nothing is returned.
    """
    grid_table = df.to_markdown(tablefmt="grid")
    print(grid_table)


In [26]:
import pandas as pd
import nannyml as nml
# Load the modified California housing dataset from nml.datasets.
# The loader's result is unpacked into three frames: the reference set, the analysis set,
# and the analysis ground truth (merged back onto the analysis rows later in this notebook).
reference, analysis, analysis_gt = nml.datasets.load_modified_california_housing_dataset()
# Preview the first three reference rows.
reference.head(3)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,timestamp,partition,clf_target,y_pred_proba,identifier
0,9.8413,32.0,7.17004,1.014845,4353.0,2.937247,34.22,-118.19,2020-10-01 00:00:00,reference,1,0.99,0
1,8.3695,37.0,7.458746,1.062706,941.0,3.105611,34.22,-118.21,2020-10-01 01:00:00,reference,1,1.0,1
2,8.72,44.0,6.16318,1.046025,668.0,2.794979,34.2,-118.18,2020-10-01 02:00:00,reference,1,1.0,2


In [27]:
# Extract model metadata from the reference data, then explicitly set the names of the
# target (ground truth) column and the timestamp column.
metadata = nml.extract_metadata(reference)
metadata.target_column_name = 'clf_target'
metadata.timestamp_column_name = 'timestamp'

In [28]:
# Fit the CBPE performance estimator on the reference data only, chunking by month
# (chunk_period='M'), then estimate performance for reference and analysis stacked together
# so both periods appear in the results.
cbpe = nml.CBPE(model_metadata=metadata, chunk_period='M')
cbpe.fit(reference_data=reference)
est_perf = cbpe.estimate(pd.concat([reference, analysis]))


The resulting list of chunks contains 1 underpopulated chunks. They contain too few records to be statistically relevant and might negatively influence the quality of calculations. Please consider splitting your data in a different way or continue at your own risk.



In [29]:
# Inspect the last three chunks of the estimation results.
est_perf.data.tail(3)

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,estimated_roc_auc,confidence,upper_threshold,lower_threshold,alert
17,2022-03,6552,7295,2022-03-01,2022-03-31 23:59:59,analysis,0.829077,0.051046,1.0,0.708336,False
18,2022-04,7296,8015,2022-04-01,2022-04-30 23:59:59,analysis,0.910661,0.051046,1.0,0.708336,False
19,2022-05,8016,8231,2022-05-01,2022-05-09 23:59:59,analysis,0.939883,0.051046,1.0,0.708336,False


In [30]:
# Drop the last chunk: the preview above shows it ends on 2022-05-09, i.e. a partial month
# (presumably the underpopulated chunk mentioned in the estimator warning — confirm).
# NOTE: this reassigns est_perf.data, so re-running this cell removes one more chunk each time.
est_perf.data = est_perf.data[:-1].copy()
# Confirm what the last two remaining chunks are.
est_perf.data.tail(2)

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,estimated_roc_auc,confidence,upper_threshold,lower_threshold,alert
17,2022-03,6552,7295,2022-03-01,2022-03-31 23:59:59,analysis,0.829077,0.051046,1.0,0.708336,False
18,2022-04,7296,8015,2022-04-01,2022-04-30 23:59:59,analysis,0.910661,0.051046,1.0,0.708336,False


In [None]:
# Build the 'performance' plot from the estimation results and render it.
fig = est_perf.plot(kind='performance')
fig.show()

In [32]:
# Export the performance figure as an SVG under ../_static using the "orca" engine.
# NOTE(review): the recorded output of this cell is a ValueError stating that image generation
# requires the psutil package — install psutil in the kernel environment before re-running.
fig.write_image(file="../_static/example_california_performance.svg", engine="orca")

ValueError: Image generation requires the psutil package.

Install using pip:
    $ pip install psutil

Install using conda:
    $ conda install psutil


In [None]:
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Add ground truth to the analysis rows (joined on 'identifier') and stack them under the
# reference rows, so every chunk can be scored against real targets.
analysis_full = pd.merge(analysis, analysis_gt, on='identifier')
df_all = pd.concat([reference, analysis_full]).reset_index(drop=True)
# Parse timestamps so they can be compared against the chunk start/end dates.
df_all['timestamp'] = pd.to_datetime(df_all['timestamp'])

# Calculate the actual ROC AUC per chunk and store it next to the estimate
# in a new 'actual_roc_auc' column of est_perf.data.
target_col = metadata.target_column_name
pred_score_col = 'y_pred_proba'
for idx in est_perf.data.index:
    start_date, end_date = est_perf.data.loc[idx, 'start_date'], est_perf.data.loc[idx, 'end_date']
    # Select the rows whose timestamp falls within this chunk's date range.
    sub = df_all[df_all['timestamp'].between(start_date, end_date)]
    actual_perf = roc_auc_score(sub[target_col], sub[pred_score_col])
    est_perf.data.loc[idx, 'actual_roc_auc'] = actual_perf

# Plot estimated vs. actual ROC AUC per chunk, marking where the analysis period starts.
first_analysis = est_perf.data[est_perf.data['partition']=='analysis']['key'].values[0]
plt.plot(est_perf.data['key'], est_perf.data['estimated_roc_auc'], label='estimated AUC')
plt.plot(est_perf.data['key'], est_perf.data['actual_roc_auc'], label='actual ROC AUC')
plt.xticks(rotation=90)
plt.axvline(x=first_analysis, label='First analysis chunk', linestyle=':', color='grey')
plt.ylabel('ROC AUC')
plt.legend()
plt.show()

In [None]:
# Fit the univariate statistical drift calculator on the reference data (monthly chunks),
# calculate drift on the analysis data, and rank the results by alert count.
# NOTE(review): pd.concat([analysis]) wraps a single frame, so it looks equivalent to passing
# a copy of `analysis` — confirm whether the copy is intentional.
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(model_metadata=metadata, chunk_period='M')
univariate_calculator.fit(reference_data=reference)
univariate_results = univariate_calculator.calculate(data=pd.concat([analysis]))
nml.Ranker.by('alert_count').rank(univariate_results, metadata)

In [None]:
# Keep only the result columns whose name contains 'dstat' (the d-statistic columns),
# then show each column's mean, sorted from largest to smallest.
d_stat_cols = [x for x in univariate_results.data if 'dstat' in x]
univariate_results.data[d_stat_cols].mean().sort_values(ascending=False)

In [None]:
# For each of the two location features, render the 'feature_distribution' plot and
# export it as an SVG whose file name ends with the feature label.
# NOTE(review): the export uses engine="orca"; a recorded output elsewhere in this notebook
# shows that engine raising ValueError when the psutil package is missing.
for label in ['Longitude', 'Latitude']:
    fig = univariate_results.plot(
        kind='feature_distribution',
        feature_label=label)
    fig.show()
    fig.write_image("../_static/example_california_performance_distribution_{}.svg".format(label), engine="orca")

In [None]:
# Scatter the locations of the records in the first ten analysis chunks, one colour per
# chunk, to see how the sampled area changes between chunks.
# The results frame is est_perf.data; the previously used name `est_perf_data` is not
# defined anywhere in this notebook and raised NameError on a fresh kernel.
analysis_res = est_perf.data[est_perf.data['partition']=='analysis']
plt.figure(figsize=(8,6))
for idx in analysis_res.index[:10]:
    start_date, end_date = analysis_res.loc[idx, 'start_date'], analysis_res.loc[idx, 'end_date']
    # Select the rows whose timestamp falls within this chunk's date range.
    sub = df_all[df_all['timestamp'].between(start_date, end_date)]
    plt.scatter(sub['Latitude'], sub['Longitude'], s=5, label="Chunk {}".format(str(idx)))
plt.legend()
plt.xlabel('Latitude')
plt.ylabel('Longitude')
# Render the figure explicitly, as the earlier matplotlib cell does.
plt.show()