In [1]:
# Show the kernel's working directory; the relative "../data" paths used below resolve against it.
# NOTE(review): the saved output of this cell exposes a machine-specific absolute path —
# consider clearing it before sharing the notebook.
!pwd

/Users/tung.dao/tung/mlopsvn/code/mlops-crash-course-code/monitoring_service/nbs


In [2]:
import pandas as pd
import fastparquet
from pathlib import Path
import numpy as np

# Seed NumPy's global random state so NumPy-based randomness is repeatable across runs.
# NOTE(review): no cell visible in this notebook draws random numbers directly.
random_seed = 17
np.random.seed(random_seed)

## Load data

In [3]:
# Directory holding the mock datasets, relative to the notebook's working directory.
OUTSIDE_DATA_DIR = Path("..") / "data"

# Individual dataset files inside that directory.
ORIG_DATA_PATH = OUTSIDE_DATA_DIR.joinpath("mock_normal_data.parquet")
DRIFT_DATA_PATH = OUTSIDE_DATA_DIR.joinpath("mock_drift_data.parquet")
REQUEST_DATA_PATH = OUTSIDE_DATA_DIR.joinpath("mock_request_data.csv")

In [4]:
# Load the "normal" feature snapshot (used later as the drift monitor's reference data),
# explicitly requesting the fastparquet engine.
normal_df = pd.read_parquet(ORIG_DATA_PATH, engine='fastparquet')
normal_df

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-19 23:00:00+00:00,1001,0.186341,0.226879,107,2021-07-28 11:08:04.802
1,2021-07-18 06:00:00+00:00,1002,0.071032,0.22949,250,2021-07-28 11:08:04.802
2,2021-07-28 09:00:00+00:00,1003,0.05,0.192864,103,2021-07-28 11:08:04.802
3,2021-07-27 10:00:00+00:00,1004,0.184332,0.05,49,2021-07-28 11:08:04.802
4,2021-07-23 05:00:00+00:00,1005,0.25,0.25,246,2021-07-28 11:08:04.802


In [5]:
# Load the "drift" feature snapshot (used later as the monitors' current data),
# explicitly requesting the fastparquet engine.
drift_df = pd.read_parquet(DRIFT_DATA_PATH, engine='fastparquet')
drift_df

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-19 23:00:00+00:00,1001,0.886341,0.926879,807,2021-07-28 11:08:04.802
1,2021-07-18 06:00:00+00:00,1002,0.771032,0.92949,950,2021-07-28 11:08:04.802
2,2021-07-28 09:00:00+00:00,1003,0.75,0.892864,803,2021-07-28 11:08:04.802
3,2021-07-27 10:00:00+00:00,1004,0.884332,0.75,750,2021-07-28 11:08:04.802
4,2021-07-23 05:00:00+00:00,1005,0.95,0.95,946,2021-07-28 11:08:04.802


In [6]:
# Load the mock request log; its trip_completed column is used later as the ground-truth label.
request_df = pd.read_csv(REQUEST_DATA_PATH)
request_df

Unnamed: 0,request_id,driver_ids,trip_completed
0,uuid-0,[1001],0
1,uuid-1,[1002],1
2,uuid-2,[1003],0
3,uuid-3,[1004],0
4,uuid-4,[1005],1


In [7]:
# Summary statistics of the normal snapshot, for comparison with the drift snapshot.
normal_df.describe()

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips
count,5.0,5.0,5.0,5.0
mean,1003.0,0.148341,0.189847,151.0
std,1.581139,0.084737,0.08082,91.47404
min,1001.0,0.05,0.05,49.0
25%,1002.0,0.071032,0.192864,103.0
50%,1003.0,0.184332,0.226879,107.0
75%,1004.0,0.186341,0.22949,246.0
max,1005.0,0.25,0.25,250.0


In [8]:
# Summary statistics of the drift snapshot; compare with the normal snapshot's statistics.
drift_df.describe()

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips
count,5.0,5.0,5.0,5.0
mean,1003.0,0.848341,0.889847,851.2
std,1.581139,0.084737,0.08082,91.195943
min,1001.0,0.75,0.75,750.0
25%,1002.0,0.771032,0.892864,803.0
50%,1003.0,0.884332,0.926879,807.0
75%,1004.0,0.886341,0.92949,946.0
max,1005.0,0.95,0.95,950.0


In [9]:
# Summary statistics of the request log.
request_df.describe()

Unnamed: 0,trip_completed
count,5.0
mean,0.4
std,0.547723
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## Set up monitoring

In [10]:
import dataclasses
from typing import Dict, List, Optional

from evidently.pipeline.column_mapping import ColumnMapping
from evidently.model_monitoring import ModelMonitoring
from evidently.model_monitoring import ClassificationPerformanceMonitor
from evidently.model_monitoring import DataDriftMonitor

@dataclasses.dataclass
class LoadedDataset:
    """Bundle a named dataset with its reference frame and monitoring settings.

    NOTE(review): this class is not instantiated in any cell visible in this
    notebook; the field descriptions below are inferred from names and types
    and should be confirmed against the code that uses it.
    """

    # Identifier of the dataset.
    name: str
    # Reference data frame — presumably the baseline that current data is compared against; confirm.
    references: pd.DataFrame
    # Monitor identifiers — presumably the names of the monitors to enable for this dataset; confirm.
    monitors: List[str]
    # Evidently column mapping describing the dataset's columns.
    column_mapping: ColumnMapping

In [11]:
# Tell Evidently which columns hold the label, the model output and the numeric features.
# The "trip_completed" and "prediction" columns are not in the frames as loaded; they are
# added to drift_df further down, before the performance monitor runs.
column_mapping = ColumnMapping(
    target="trip_completed",
    prediction="prediction",
    numerical_features=["conv_rate", "acc_rate", "avg_daily_trips"],
    categorical_features=[],
)
column_mapping


ColumnMapping(target='trip_completed', prediction='prediction', datetime='datetime', id=None, numerical_features=['conv_rate', 'acc_rate', 'avg_daily_trips'], categorical_features=[], datetime_features=None, target_names=None, task=None, pos_label=1)

In [12]:
# One ModelMonitoring pipeline per concern: data drift and classification quality.
# NOTE(review): despite its name, features_and_target_monitor only contains a DataDriftMonitor.
features_and_target_monitor = ModelMonitoring(monitors=[DataDriftMonitor()])
model_performance_monitor = ModelMonitoring(monitors=[ClassificationPerformanceMonitor()])

## Run data drift monitoring

In [13]:
def print_metrics(monitoring):
    """Print every metric reported by ``monitoring``, one per line.

    Args:
        monitoring: object whose ``metrics()`` method yields
            ``(metric, value, labels)`` triples, where ``metric`` exposes a
            ``name`` attribute.

    Each line is written to stdout as ``<metric.name> | <value> | <labels>``.
    """
    for entry in monitoring.metrics():
        metric, value, labels = entry
        print(f"{metric.name} | {value} | {labels}")

In [14]:
# Compare the drift snapshot (current) against the normal snapshot (reference),
# then print every metric the drift monitor produced.
features_and_target_monitor.execute(
    reference_data=normal_df,
    current_data=drift_df,
    column_mapping=column_mapping,
)

print_metrics(features_and_target_monitor)

data_drift:share_drifted_features | 1.0 | None
data_drift:n_drifted_features | 3 | None
data_drift:dataset_drift | True | None
data_drift:p_value | 0.007936507936507936 | {'feature': 'acc_rate', 'feature_type': 'num'}
data_drift:p_value | 0.007936507936507936 | {'feature': 'avg_daily_trips', 'feature_type': 'num'}
data_drift:p_value | 0.007936507936507936 | {'feature': 'conv_rate', 'feature_type': 'num'}


## Run model performance monitoring

In [15]:
# Mock model output: predict class 1 for every row of the drift snapshot.
predictions = [1 for _ in range(len(drift_df))]

# Attach the mock predictions and the ground-truth labels in one step; the
# trip_completed Series is matched to drift_df rows by index.
drift_df = drift_df.assign(
    prediction=predictions,
    trip_completed=request_df["trip_completed"],
)
drift_df

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created,prediction,trip_completed
0,2021-07-19 23:00:00+00:00,1001,0.886341,0.926879,807,2021-07-28 11:08:04.802,1,0
1,2021-07-18 06:00:00+00:00,1002,0.771032,0.92949,950,2021-07-28 11:08:04.802,1,1
2,2021-07-28 09:00:00+00:00,1003,0.75,0.892864,803,2021-07-28 11:08:04.802,1,0
3,2021-07-27 10:00:00+00:00,1004,0.884332,0.75,750,2021-07-28 11:08:04.802,1,0
4,2021-07-23 05:00:00+00:00,1005,0.95,0.95,946,2021-07-28 11:08:04.802,1,1


In [16]:
# Compute classification quality metrics from the prediction / trip_completed columns of drift_df,
# then print every metric the performance monitor produced.
# NOTE(review): the same frame is passed as both reference and current data, so this scores
# the model on a single dataset rather than comparing two — confirm this is intended.
model_performance_monitor.execute(
    reference_data=drift_df,
    current_data=drift_df,
    column_mapping=column_mapping,
)

print_metrics(model_performance_monitor)

classification_performance:quality | 0.4 | {'dataset': 'reference', 'metric': 'accuracy'}
classification_performance:quality | 0.2 | {'dataset': 'reference', 'metric': 'precision'}
classification_performance:quality | 0.5 | {'dataset': 'reference', 'metric': 'recall'}
classification_performance:quality | 0.28571428571428575 | {'dataset': 'reference', 'metric': 'f1'}
classification_performance:class_quality | 0.0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'precision'}
classification_performance:class_quality | 0.0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'recall'}
classification_performance:class_quality | 0.0 | {'dataset': 'reference', 'class_name': '0', 'metric': 'f1'}
classification_performance:class_quality | 0.4 | {'dataset': 'reference', 'class_name': '1', 'metric': 'precision'}
classification_performance:class_quality | 1.0 | {'dataset': 'reference', 'class_name': '1', 'metric': 'recall'}
classification_performance:class_quality | 0.5714285714285715 | 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
