In [1]:
# Print the kernel's working directory; the relative "../data" paths used below resolve against it.
# NOTE(review): the saved output embeds a machine-specific absolute path -- clear it before sharing.
!pwd

/Users/tung.dao/tung/mlopsvn/code/mlops-crash-course-code/monitoring_service/nbs


In [2]:
import pandas as pd
import fastparquet
from pathlib import Path
import numpy as np

# Single seed shared by NumPy's global RNG and the seeded calls further down,
# so the generated mock data is reproducible.
random_seed = 17
np.random.seed(seed=random_seed)

## Load data

In [4]:
# Location of the reference dataset, relative to the notebook's working directory.
OUTSIDE_DATA_DIR = Path("../data")
DATA_PATH = OUTSIDE_DATA_DIR / "orig_driver_stats.parquet"

# Fail fast, and report the resolved path so a wrong working directory is obvious.
# FileNotFoundError is still an Exception, so any broad handler keeps working.
if not DATA_PATH.is_file():
    raise FileNotFoundError(f"DATA_PATH not found: {DATA_PATH.resolve()}")

In [5]:
# Load the reference driver statistics from the Parquet file checked above.
df_orig = pd.read_parquet(path=DATA_PATH, engine="fastparquet")
df_orig

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-13 11:00:00+00:00,1005,0.373837,0.154890,498,2021-07-28 11:08:04.802
1,2021-07-13 12:00:00+00:00,1005,0.571627,0.643958,656,2021-07-28 11:08:04.802
2,2021-07-13 13:00:00+00:00,1005,0.399909,0.993888,722,2021-07-28 11:08:04.802
3,2021-07-13 14:00:00+00:00,1005,0.967468,0.788458,424,2021-07-28 11:08:04.802
4,2021-07-13 15:00:00+00:00,1005,0.024679,0.956064,569,2021-07-28 11:08:04.802
...,...,...,...,...,...,...
1802,2021-07-28 09:00:00+00:00,1001,0.089418,0.311234,485,2021-07-28 11:08:04.802
1803,2021-07-28 10:00:00+00:00,1001,0.222534,0.927691,114,2021-07-28 11:08:04.802
1804,2021-04-12 07:00:00+00:00,1001,0.175219,0.761434,385,2021-07-28 11:08:04.802
902,2021-07-20 23:00:00+00:00,1003,0.025968,0.109748,55,2021-07-28 11:08:04.802


In [6]:
# Column dtypes, non-null counts and memory footprint of the reference frame.
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1807 entries, 0 to 902
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   datetime         1807 non-null   datetime64[ns, UTC]
 1   driver_id        1807 non-null   int64              
 2   conv_rate        1807 non-null   float64            
 3   acc_rate         1807 non-null   float64            
 4   avg_daily_trips  1807 non-null   int64              
 5   created          1807 non-null   datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(2), int64(2)
memory usage: 98.8 KB


In [7]:
# Summary statistics of the reference data (describe() reports the numeric columns here).
desc_df = df_orig.describe()
desc_df

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips
count,1807.0,1807.0,1807.0,1807.0
mean,1003.0,0.488267,0.505205,500.871057
std,1.413822,0.291862,0.29123,293.412315
min,1001.0,0.000482,0.000542,0.0
25%,1002.0,0.238879,0.251682,236.0
50%,1003.0,0.491606,0.507843,506.0
75%,1004.0,0.732576,0.770225,754.0
max,1005.0,0.998767,0.999445,998.0


## Mock feature drift data

In [8]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler

In [9]:
N_SAMPLES = 100

# Synthetic classification problem, seeded so the mock data is reproducible.
X, y = make_classification(n_samples=N_SAMPLES, random_state=random_seed)

# Shift feature range: rescale every feature column into [0.5, 0.995].
scaler = MinMaxScaler(feature_range=(0.5, 0.995)).fit(X)
X = scaler.transform(X)

# Shift target: force the label to 1 wherever the fourth feature exceeds 0.75.
high_feature_mask = X[:, 3] > 0.75
y[high_feature_mask] = 1

In [10]:
# Assemble the mock frame in one step from the first three synthetic features and the label.
mock_df = pd.DataFrame(
    {
        'conv_rate': X[:, 0],
        'acc_rate': X[:, 1],
        # Scale the [0.5, 0.995] feature up by 1000 and truncate to whole trips.
        'avg_daily_trips': (X[:, 2] * 1000).astype(int),
        'trip_completed': y,
    }
)
mock_df.describe()

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips,trip_completed
count,100.0,100.0,100.0,100.0
mean,0.731518,0.701609,751.45,0.83
std,0.090697,0.095636,78.185402,0.377525
min,0.5,0.5,500.0,0.0
25%,0.674897,0.627561,698.0,1.0
50%,0.728371,0.688873,749.0,1.0
75%,0.786183,0.771097,796.25,1.0
max,0.995,0.995,994.0,1.0


In [11]:
# Borrow realistic (timestamp, driver) keys from the reference data:
# shuffle it reproducibly and take as many rows as there are mock samples.
shuffled_orig_df = df_orig.sample(frac=1, random_state=random_seed)

# df_orig carries a non-unique integer index, so select rows strictly by position.
sampled_keys = shuffled_orig_df.iloc[:len(mock_df)]

# `.values` hands over raw arrays so the columns are attached by position, not
# aligned on index labels.
# NOTE(review): the displayed event_timestamp values carry no UTC offset, unlike
# the source `datetime` column -- confirm naive timestamps are what consumers expect.
mock_df = mock_df.assign(
    event_timestamp=sampled_keys['datetime'].values,
    driver_id=sampled_keys['driver_id'].values,
)
mock_df

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips,trip_completed,event_timestamp,driver_id
0,0.562670,0.711753,817,1,2021-07-19 23:00:00,1003
1,0.747795,0.729159,664,1,2021-07-18 06:00:00,1005
2,0.577423,0.600396,800,1,2021-07-28 09:00:00,1003
3,0.676030,0.587644,820,1,2021-07-27 10:00:00,1002
4,0.867539,0.571839,754,0,2021-07-23 05:00:00,1001
...,...,...,...,...,...,...
95,0.784332,0.550629,741,0,2021-07-20 09:00:00,1004
96,0.682082,0.600372,752,0,2021-07-23 14:00:00,1001
97,0.732227,0.874406,841,1,2021-07-24 12:00:00,1004
98,0.768284,0.835585,769,1,2021-07-27 17:00:00,1003


In [12]:
# Column order for the mock frame: timestamp and driver keys, then features, then the label.
cols = [
    'event_timestamp',
    'driver_id',
    'conv_rate',
    'acc_rate',
    'avg_daily_trips',
    'trip_completed',
]
mock_df = mock_df.reindex(cols, axis="columns")
mock_df

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips,trip_completed
0,2021-07-19 23:00:00,1003,0.562670,0.711753,817,1
1,2021-07-18 06:00:00,1005,0.747795,0.729159,664,1
2,2021-07-28 09:00:00,1003,0.577423,0.600396,800,1
3,2021-07-27 10:00:00,1002,0.676030,0.587644,820,1
4,2021-07-23 05:00:00,1001,0.867539,0.571839,754,0
...,...,...,...,...,...,...
95,2021-07-20 09:00:00,1004,0.784332,0.550629,741,0
96,2021-07-23 14:00:00,1001,0.682082,0.600372,752,0
97,2021-07-24 12:00:00,1004,0.732227,0.874406,841,1
98,2021-07-27 17:00:00,1003,0.768284,0.835585,769,1


In [13]:
# Summary statistics of the mock data after the key columns were added.
mock_df.describe()

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips,trip_completed
count,100.0,100.0,100.0,100.0,100.0
mean,1002.76,0.731518,0.701609,751.45,0.83
std,1.341791,0.090697,0.095636,78.185402,0.377525
min,1001.0,0.5,0.5,500.0,0.0
25%,1001.75,0.674897,0.627561,698.0,1.0
50%,1003.0,0.728371,0.688873,749.0,1.0
75%,1004.0,0.786183,0.771097,796.25,1.0
max,1005.0,0.995,0.995,994.0,1.0


In [14]:
# Driver-stats view: keys plus the three feature columns, without the label.
stats_cols = [
    'event_timestamp',
    'driver_id',
    'conv_rate',
    'acc_rate',
    'avg_daily_trips',
]
mock_stats = mock_df.loc[:, stats_cols]
mock_stats

Unnamed: 0,event_timestamp,driver_id,conv_rate,acc_rate,avg_daily_trips
0,2021-07-19 23:00:00,1003,0.562670,0.711753,817
1,2021-07-18 06:00:00,1005,0.747795,0.729159,664
2,2021-07-28 09:00:00,1003,0.577423,0.600396,800
3,2021-07-27 10:00:00,1002,0.676030,0.587644,820
4,2021-07-23 05:00:00,1001,0.867539,0.571839,754
...,...,...,...,...,...
95,2021-07-20 09:00:00,1004,0.784332,0.550629,741
96,2021-07-23 14:00:00,1001,0.682082,0.600372,752
97,2021-07-24 12:00:00,1004,0.732227,0.874406,841
98,2021-07-27 17:00:00,1003,0.768284,0.835585,769


In [15]:
# Orders view: keys plus the label, limited to the first 20 rows.
orders_cols = ['event_timestamp', 'driver_id', 'trip_completed']
mock_orders = mock_df.loc[:, orders_cols].head(20)
mock_orders

Unnamed: 0,event_timestamp,driver_id,trip_completed
0,2021-07-19 23:00:00,1003,1
1,2021-07-18 06:00:00,1005,1
2,2021-07-28 09:00:00,1003,1
3,2021-07-27 10:00:00,1002,1
4,2021-07-23 05:00:00,1001,0
5,2021-07-22 23:00:00,1001,1
6,2021-07-23 19:00:00,1004,1
7,2021-07-17 18:00:00,1001,1
8,2021-07-24 20:00:00,1001,1
9,2021-07-27 18:00:00,1002,1


## Save mock data

In [16]:
# Output locations, next to the reference dataset.
OUTSIDE_DATA_DIR = Path("../data")
MOCK_PATH = OUTSIDE_DATA_DIR / "mock_data.parquet"
MOCK_STATS = OUTSIDE_DATA_DIR / "mock_driver_stats.parquet"
MOCK_ORDERS = OUTSIDE_DATA_DIR / "mock_driver_orders.csv"

# Full mock frame and the stats view are stored as Parquet.
mock_df.to_parquet(MOCK_PATH, engine="fastparquet")
mock_stats.to_parquet(MOCK_STATS, engine="fastparquet")

# The orders target ends in ".csv", so write real CSV rather than Parquet bytes
# under a CSV name; the positional index carries no information and is omitted.
# NOTE(review): confirm the consumer of this file reads it as CSV.
mock_orders.to_csv(MOCK_ORDERS, index=False)