In [1]:
!pwd

/home/tungdao/tung/mlopsvn/code/mlops-crash-course-code/monitoring_service/nbs


In [2]:
import pandas as pd
import fastparquet
from pathlib import Path
import numpy as np

# Seed NumPy's global random state so the np.random draws made later in the
# notebook (see format_data_df) are repeatable on a fresh Restart & Run All.
random_seed = 17
np.random.seed(random_seed)

## Load data

In [3]:
# Directory holding the mock datasets, relative to the notebook's working directory.
OUTSIDE_DATA_DIR = Path("../data")
# Mock "normal" data; loaded below as orig_df.
ORIG_DATA_PATH = OUTSIDE_DATA_DIR / "mock_normal_data.parquet"
# Mock "drift" data; loaded below as mock_df.
DRIFT_DATA_PATH = OUTSIDE_DATA_DIR / "mock_drift_data.parquet"
# NOTE(review): not read in any of the visible cells — confirm it is used further down.
REQUEST_DATA_PATH = OUTSIDE_DATA_DIR / "mock_request_data.csv"

In [4]:
def format_data_df(df, pred_prob):
    """Return a copy of ``df`` with simulated ``prediction`` and ``trip_completed`` columns.

    Both columns hold values drawn from ``[0, 1]`` with ``np.random.choice``,
    one independent draw of ``len(df)`` values per column, using the same
    probabilities for each.

    Args:
        df: Input DataFrame; it is not modified, ``assign`` builds a new frame.
        pred_prob: Two probabilities, for 0 and for 1 respectively, forwarded
            as ``p`` to ``np.random.choice``.

    Returns:
        A new DataFrame with ``prediction`` and ``trip_completed`` added
        (or overwritten if already present).
    """
    labels = [0, 1]
    row_count = len(df)

    # Draw order is significant: the global NumPy random state is consumed
    # for "prediction" first and "trip_completed" second.
    simulated = {}
    for column in ("prediction", "trip_completed"):
        simulated[column] = np.random.choice(labels, size=row_count, p=pred_prob)

    return df.assign(**simulated)

In [5]:
# Read the mock "normal" data and attach simulated prediction / trip_completed
# columns, with 0 and 1 equally likely.
orig_df = pd.read_parquet(ORIG_DATA_PATH, engine='fastparquet').pipe(
    format_data_df, [0.5, 0.5]
)
orig_df

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created,prediction,trip_completed
0,2021-07-19 23:00:00+00:00,1001,0.186341,0.226879,107,2021-07-28 11:08:04.802,1,1
1,2021-07-18 06:00:00+00:00,1002,0.071032,0.22949,250,2021-07-28 11:08:04.802,1,1
2,2021-07-28 09:00:00+00:00,1003,0.05,0.192864,103,2021-07-28 11:08:04.802,0,1
3,2021-07-27 10:00:00+00:00,1004,0.184332,0.05,49,2021-07-28 11:08:04.802,0,0
4,2021-07-23 05:00:00+00:00,1005,0.25,0.25,246,2021-07-28 11:08:04.802,1,1


In [6]:
# Read the mock "drift" data and attach simulated prediction / trip_completed
# columns, drawing 0 with probability 0.2 and 1 with probability 0.8.
mock_df = pd.read_parquet(DRIFT_DATA_PATH, engine='fastparquet').pipe(
    format_data_df, [0.2, 0.8]
)
mock_df

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created,prediction,trip_completed
0,2021-07-19 23:00:00+00:00,1001,0.886341,0.926879,807,2021-07-28 11:08:04.802,1,1
1,2021-07-18 06:00:00+00:00,1002,0.771032,0.92949,950,2021-07-28 11:08:04.802,0,1
2,2021-07-28 09:00:00+00:00,1003,0.75,0.892864,803,2021-07-28 11:08:04.802,1,1
3,2021-07-27 10:00:00+00:00,1004,0.884332,0.75,750,2021-07-28 11:08:04.802,1,1
4,2021-07-23 05:00:00+00:00,1005,0.95,0.95,946,2021-07-28 11:08:04.802,0,1


In [7]:
# Summary statistics of the "normal" data, to compare against mock_df.describe().
orig_df.describe()

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips,prediction,trip_completed
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,1003.0,0.148341,0.189847,151.0,0.6,0.8
std,1.581139,0.084737,0.08082,91.47404,0.547723,0.447214
min,1001.0,0.05,0.05,49.0,0.0,0.0
25%,1002.0,0.071032,0.192864,103.0,0.0,1.0
50%,1003.0,0.184332,0.226879,107.0,1.0,1.0
75%,1004.0,0.186341,0.22949,246.0,1.0,1.0
max,1005.0,0.25,0.25,250.0,1.0,1.0


In [8]:
# Summary statistics of the "drift" data, to compare against orig_df.describe().
mock_df.describe()

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips,prediction,trip_completed
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,1003.0,0.848341,0.889847,851.2,0.6,1.0
std,1.581139,0.084737,0.08082,91.195943,0.547723,0.0
min,1001.0,0.75,0.75,750.0,0.0,1.0
25%,1002.0,0.771032,0.892864,803.0,0.0,1.0
50%,1003.0,0.884332,0.926879,807.0,1.0,1.0
75%,1004.0,0.886341,0.92949,946.0,1.0,1.0
max,1005.0,0.95,0.95,950.0,1.0,1.0


## Check data quality

In [9]:
import dataclasses
from typing import Dict, List, Optional

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.model_monitoring import ModelMonitoring
from evidently.model_monitoring import CatTargetDriftMonitor
from evidently.model_monitoring import ClassificationPerformanceMonitor
from evidently.model_monitoring import DataDriftMonitor
from evidently.model_monitoring import DataQualityMonitor
from evidently.model_monitoring import NumTargetDriftMonitor
from evidently.model_monitoring import ProbClassificationPerformanceMonitor
from evidently.model_monitoring import RegressionPerformanceMonitor

from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab

from evidently.model_profile import Profile
from evidently.model_profile.sections import DataDriftProfileSection, CatTargetDriftProfileSection, ClassificationPerformanceProfileSection

@dataclasses.dataclass
class LoadedDataset:
    """Groups a named dataset with its reference data and monitoring settings.

    NOTE(review): this class is not instantiated in the visible cells; the
    field descriptions below are inferred from names and annotations —
    confirm against actual usage.
    """

    # Identifier of the dataset.
    name: str
    # Reference DataFrame (presumably the baseline that current data is compared to).
    references: pd.DataFrame
    # Monitor names; presumably keys of EVIDENTLY_MONITORS_MAPPING — TODO confirm.
    monitors: List[str]
    # evidently ColumnMapping for this dataset's columns.
    column_mapping: ColumnMapping

# Lookup from monitor name to evidently monitor class. Values are classes,
# not instances; they are instantiated where the ModelMonitoring run is built.
EVIDENTLY_MONITORS_MAPPING = {
    "cat_target_drift": CatTargetDriftMonitor,
    "data_drift": DataDriftMonitor,
    "data_quality": DataQualityMonitor,
    "num_target_drift": NumTargetDriftMonitor,
    "regression_performance": RegressionPerformanceMonitor,
    "classification_performance": ClassificationPerformanceMonitor,
    "prob_classification_performance": ProbClassificationPerformanceMonitor,
}

In [10]:
# Column roles handed to evidently through ColumnMapping.
DATETIME_COL = "datetime"
NUMERICAL_COLS = ["conv_rate", "acc_rate", "avg_daily_trips"]
CATEGORICAL_COLS = []  # no categorical features are declared
TARGET_COL = "trip_completed"  # simulated column added by format_data_df
PREDICTION_COL = "prediction"  # simulated column added by format_data_df
column_mapping = ColumnMapping(
    target=TARGET_COL,
    prediction=PREDICTION_COL,
    numerical_features=NUMERICAL_COLS,
    categorical_features=CATEGORICAL_COLS,
    datetime=DATETIME_COL
)
# The "normal" data serves as the reference set, the "drift" data as the current set.
references = orig_df
current_data = mock_df

# Display the resulting mapping to verify the configured column roles.
column_mapping

ColumnMapping(target='trip_completed', prediction='prediction', datetime='datetime', id=None, numerical_features=['conv_rate', 'acc_rate', 'avg_daily_trips'], categorical_features=[], datetime_features=None, target_names=None, task=None, pos_label=1)

In [11]:
# Keys into EVIDENTLY_MONITORS_MAPPING selecting which monitors to run.
monitors = ["data_drift", "classification_performance", "cat_target_drift"]
monitoring = ModelMonitoring(
    # Instantiate each selected monitor class with its default constructor.
    monitors=[EVIDENTLY_MONITORS_MAPPING[k]() for k in monitors],
    options=[],
)
# Run the monitors on the reference / current pair; must precede metrics().
monitoring.execute(references, current_data, column_mapping)
# Print every emitted metric as "name | value | labels".
for metric, value, labels in monitoring.metrics():
    report = f"{metric.name} | {value} | {labels}"
    print(report)

data_drift:share_drifted_features | 0.6 | None
data_drift:n_drifted_features | 3 | None
data_drift:dataset_drift | True | None
data_drift:p_value | 0.29184054514378865 | {'feature': 'trip_completed', 'feature_type': 'cat'}
data_drift:p_value | 1.0 | {'feature': 'prediction', 'feature_type': 'cat'}
data_drift:p_value | 0.007936507936507936 | {'feature': 'acc_rate', 'feature_type': 'num'}
data_drift:p_value | 0.007936507936507936 | {'feature': 'avg_daily_trips', 'feature_type': 'num'}
data_drift:p_value | 0.007936507936507936 | {'feature': 'conv_rate', 'feature_type': 'num'}
classification_performance:quality | 0.8 | {'dataset': 'reference', 'metric': 'accuracy'}
classification_performance:quality | 0.75 | {'dataset': 'reference', 'metric': 'precision'}
classification_performance:quality | 0.875 | {'dataset': 'reference', 'metric': 'recall'}
classification_performance:quality | 0.7619047619047619 | {'dataset': 'reference', 'metric': 'f1'}
classification_performance:class_quality | 0.5 | 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
