In [1]:
# Print the current working directory via a shell escape; the relative
# data paths used later in the notebook are resolved against it.
!pwd

/Users/tung.dao/tung/mlopsvn/code/mlops-crash-course-code/monitoring_service/nbs


In [2]:
import pandas as pd
import fastparquet
from pathlib import Path
import numpy as np

# Seed NumPy's global random state so that any random draws made through
# `np.random` are repeatable from one run of the notebook to the next.
random_seed = 17
np.random.seed(seed=random_seed)

## Load data

In [22]:
# Locations of the two input parquet files, relative to the working directory.
OUTSIDE_DATA_DIR = Path("../data")
ORIG_DATA_PATH = OUTSIDE_DATA_DIR / "orig_driver_stats.parquet"
MOCK_DATA_PATH = OUTSIDE_DATA_DIR / "mock_driver_stats.parquet"

# Fail fast, before any loading is attempted, if an input file is missing.
# FileNotFoundError is a subclass of Exception, so code that caught the
# previous bare Exception keeps working; the message text is unchanged.
for required_path in (ORIG_DATA_PATH, MOCK_DATA_PATH):
    if not required_path.is_file():
        raise FileNotFoundError(f"{required_path} not found")

In [23]:
# Read the original driver statistics with the fastparquet engine and display them.
orig_df = pd.read_parquet(path=ORIG_DATA_PATH, engine="fastparquet")
orig_df

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-13 11:00:00+00:00,1005,0.373837,0.154890,498,2021-07-28 11:08:04.802
1,2021-07-13 12:00:00+00:00,1005,0.571627,0.643958,656,2021-07-28 11:08:04.802
2,2021-07-13 13:00:00+00:00,1005,0.399909,0.993888,722,2021-07-28 11:08:04.802
3,2021-07-13 14:00:00+00:00,1005,0.967468,0.788458,424,2021-07-28 11:08:04.802
4,2021-07-13 15:00:00+00:00,1005,0.024679,0.956064,569,2021-07-28 11:08:04.802
...,...,...,...,...,...,...
1802,2021-07-28 09:00:00+00:00,1001,0.089418,0.311234,485,2021-07-28 11:08:04.802
1803,2021-07-28 10:00:00+00:00,1001,0.222534,0.927691,114,2021-07-28 11:08:04.802
1804,2021-04-12 07:00:00+00:00,1001,0.175219,0.761434,385,2021-07-28 11:08:04.802
902,2021-07-20 23:00:00+00:00,1003,0.025968,0.109748,55,2021-07-28 11:08:04.802


In [40]:
# Read the mock driver statistics and duplicate `event_timestamp` under the
# name `datetime`, which is the column name the column mapping refers to.
mock_df = pd.read_parquet(path=MOCK_DATA_PATH, engine="fastparquet").assign(
    datetime=lambda frame: frame["event_timestamp"]
)
mock_df

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,datetime
0,2021-07-19 23:00:00,1003,0.562670,0.711753,817,2021-07-19 23:00:00
1,2021-07-18 06:00:00,1005,0.747795,0.729159,664,2021-07-18 06:00:00
2,2021-07-28 09:00:00,1003,0.577423,0.600396,800,2021-07-28 09:00:00
3,2021-07-27 10:00:00,1002,0.676030,0.587644,820,2021-07-27 10:00:00
4,2021-07-23 05:00:00,1001,0.867539,0.571839,754,2021-07-23 05:00:00
...,...,...,...,...,...,...
95,2021-07-20 09:00:00,1004,0.784332,0.550629,741,2021-07-20 09:00:00
96,2021-07-23 14:00:00,1001,0.682082,0.600372,752,2021-07-23 14:00:00
97,2021-07-24 12:00:00,1004,0.732227,0.874406,841,2021-07-24 12:00:00
98,2021-07-27 17:00:00,1003,0.768284,0.835585,769,2021-07-27 17:00:00


In [26]:
# Summary statistics of the original data, for comparison with the mock data.
orig_df.describe()

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips
count,1807.0,1807.0,1807.0,1807.0
mean,1003.0,0.488267,0.505205,500.871057
std,1.413822,0.291862,0.29123,293.412315
min,1001.0,0.000482,0.000542,0.0
25%,1002.0,0.238879,0.251682,236.0
50%,1003.0,0.491606,0.507843,506.0
75%,1004.0,0.732576,0.770225,754.0
max,1005.0,0.998767,0.999445,998.0


In [27]:
# Summary statistics of the mock data, for comparison with the original data.
mock_df.describe()

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips
count,100.0,100.0,100.0,100.0
mean,1002.76,0.731518,0.701609,751.45
std,1.341791,0.090697,0.095636,78.185402
min,1001.0,0.5,0.5,500.0
25%,1001.75,0.674897,0.627561,698.0
50%,1003.0,0.728371,0.688873,749.0
75%,1004.0,0.786183,0.771097,796.25
max,1005.0,0.995,0.995,994.0


## Check data quality

In [50]:
import dataclasses
from typing import Dict, List, Optional

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.model_monitoring import ModelMonitoring
from evidently.model_monitoring import CatTargetDriftMonitor
from evidently.model_monitoring import ClassificationPerformanceMonitor
from evidently.model_monitoring import DataDriftMonitor
from evidently.model_monitoring import DataQualityMonitor
from evidently.model_monitoring import NumTargetDriftMonitor
from evidently.model_monitoring import ProbClassificationPerformanceMonitor
from evidently.model_monitoring import RegressionPerformanceMonitor

from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab

from evidently.model_profile import Profile
from evidently.model_profile.sections import DataDriftProfileSection, CatTargetDriftProfileSection

@dataclasses.dataclass
class LoadedDataset:
    """Bundle of a named dataset together with its monitoring settings.

    NOTE(review): nothing in the visible cells instantiates this class, so the
    field notes below are inferred from the names and type hints only; confirm
    them against the code that actually uses it.
    """

    # Name identifying the dataset.
    name: str
    # Reference data frame (presumably the baseline that current data is compared with).
    references: pd.DataFrame
    # Monitor names; presumably keys of EVIDENTLY_MONITORS_MAPPING — TODO confirm.
    monitors: List[str]
    # Evidently column mapping that goes with the data frame.
    column_mapping: ColumnMapping

# Lookup table from a monitor name to the Evidently monitor class it selects.
# The values are the classes themselves, not instances; the code that looks a
# name up is expected to call the class to create the monitor.
EVIDENTLY_MONITORS_MAPPING = {
    "cat_target_drift": CatTargetDriftMonitor,
    "data_drift": DataDriftMonitor,
    "data_quality": DataQualityMonitor,
    "num_target_drift": NumTargetDriftMonitor,
    "regression_performance": RegressionPerformanceMonitor,
    "classification_performance": ClassificationPerformanceMonitor,
    "prob_classification_performance": ProbClassificationPerformanceMonitor,
}

In [55]:
# Tell Evidently which role each column plays.
# NOTE(review): the frames displayed when the data was loaded show no
# `trip_completed` column — confirm that the target is really available,
# otherwise the target-based monitors have nothing to work with.
target = "trip_completed"
numerical_features = ["conv_rate", "acc_rate", "avg_daily_trips"]
categorical_features = []

column_mapping = ColumnMapping(
    datetime="datetime",
    target=target,
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

# The original data serves as the reference; the mock data is the current batch.
references, current_data = orig_df, mock_df

column_mapping

ColumnMapping(target='trip_completed', prediction='prediction', datetime='datetime', id=None, numerical_features=['conv_rate', 'acc_rate', 'avg_daily_trips'], categorical_features=[], datetime_features=None, target_names=None, task=None, pos_label=1)

In [56]:
# Instantiate the selected monitors, run them on reference vs. current data,
# and print every metric they emit as "name | value | labels".
monitors = ["data_drift", "classification_performance", "cat_target_drift"]
monitor_instances = [
    EVIDENTLY_MONITORS_MAPPING[monitor_name]() for monitor_name in monitors
]
monitoring = ModelMonitoring(monitors=monitor_instances, options=[])
monitoring.execute(references, current_data, column_mapping)

for metric, value, labels in monitoring.metrics():
    print(f"{metric.name} | {value} | {labels}")

data_drift:share_drifted_features | 1.0 | None
data_drift:n_drifted_features | 3 | None
data_drift:dataset_drift | True | None
data_drift:p_value | 0.7746270563557061 | {'feature': 'acc_rate', 'feature_type': 'num'}
data_drift:p_value | 0.9090495641010653 | {'feature': 'avg_daily_trips', 'feature_type': 'num'}
data_drift:p_value | 0.8911643440142998 | {'feature': 'conv_rate', 'feature_type': 'num'}
cat_target_drift:count | 1807 | {'dataset': 'prediction'}
cat_target_drift:count | 100 | {'dataset': 'current'}


In [52]:
# Build a dashboard with a data-drift tab and a categorical-target-drift tab,
# compute it on reference vs. current data, and write it to an HTML file.
# Calling `.show()` on the dashboard instead of saving was left disabled here.
# NOTE(review): the `bcancer_` prefix looks like a leftover from another
# example; the data in this notebook is driver statistics.
bcancer_data_and_target_drift_dashboard = Dashboard(
    tabs=[
        DataDriftTab(verbose_level=0),
        CatTargetDriftTab(verbose_level=0),
    ]
)
bcancer_data_and_target_drift_dashboard.calculate(
    references,
    current_data,
    column_mapping=column_mapping,
)
bcancer_data_and_target_drift_dashboard.save("data_and_target_drift.html")

In [54]:
# Compute the data-drift and categorical-target-drift profile on reference vs.
# current data and display it as a JSON string.
# NOTE(review): the `bcancer_` prefix looks like a leftover from another
# example; the data in this notebook is driver statistics.
bcancer_target_and_data_drift_profile = Profile(
    sections=[
        DataDriftProfileSection(),
        CatTargetDriftProfileSection(),
    ]
)
bcancer_target_and_data_drift_profile.calculate(
    references,
    current_data,
    column_mapping=column_mapping,
)
bcancer_target_and_data_drift_profile.json()

'{"data_drift": {"name": "data_drift", "datetime": "2022-09-26 18:21:53.403915", "data": {"utility_columns": {"date": "datetime", "id": null, "target": null, "prediction": null}, "cat_feature_names": [], "num_feature_names": ["acc_rate", "avg_daily_trips", "conv_rate"], "datetime_feature_names": ["created"], "target_names": null, "options": {"confidence": null, "drift_share": 0.5, "nbinsx": 10, "xbins": null}, "metrics": {"n_features": 3, "n_drifted_features": 3, "share_drifted_features": 1.0, "dataset_drift": true, "acc_rate": {"current_small_hist": [[0.6060606060606062, 1.8181818181818188, 4.848484848484839, 3.030303030303038, 3.0303030303030245, 4.646464646464648, 1.0101010101010104, 0.6060606060606062, 0.20202020202020207, 0.40404040404040414], [0.5, 0.5495, 0.599, 0.6485000000000001, 0.698, 0.7475, 0.797, 0.8465, 0.896, 0.9455, 0.995]], "ref_small_hist": [[0.9639796949027413, 1.0027604872264149, 1.0526215059282804, 0.9473593553354527, 0.9584395817136451, 0.9861401476591257, 1.0138