In [1]:
!pwd

/home/tungdao/tung/mlopsvn/code/mlops-crash-course-code/monitoring_service/nbs


In [2]:
import pandas as pd
import fastparquet
from pathlib import Path
import numpy as np

random_seed = 17
np.random.seed(random_seed)

## Load data

In [3]:
OUTSIDE_DATA_DIR = Path("../data")
DATA_PATH = OUTSIDE_DATA_DIR / "orig_driver_stats.parquet"
if not DATA_PATH.is_file():
    raise Exception("DATA_PATH not found")

In [4]:
df_orig = pd.read_parquet(DATA_PATH, engine='fastparquet')
df_orig

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-13 11:00:00+00:00,1005,0.373837,0.154890,498,2021-07-28 11:08:04.802
1,2021-07-13 12:00:00+00:00,1005,0.571627,0.643958,656,2021-07-28 11:08:04.802
2,2021-07-13 13:00:00+00:00,1005,0.399909,0.993888,722,2021-07-28 11:08:04.802
3,2021-07-13 14:00:00+00:00,1005,0.967468,0.788458,424,2021-07-28 11:08:04.802
4,2021-07-13 15:00:00+00:00,1005,0.024679,0.956064,569,2021-07-28 11:08:04.802
...,...,...,...,...,...,...
1802,2021-07-28 09:00:00+00:00,1001,0.089418,0.311234,485,2021-07-28 11:08:04.802
1803,2021-07-28 10:00:00+00:00,1001,0.222534,0.927691,114,2021-07-28 11:08:04.802
1804,2021-04-12 07:00:00+00:00,1001,0.175219,0.761434,385,2021-07-28 11:08:04.802
902,2021-07-20 23:00:00+00:00,1003,0.025968,0.109748,55,2021-07-28 11:08:04.802


In [5]:
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1807 entries, 0 to 902
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   datetime         1807 non-null   datetime64[ns, UTC]
 1   driver_id        1807 non-null   int64              
 2   conv_rate        1807 non-null   float64            
 3   acc_rate         1807 non-null   float64            
 4   avg_daily_trips  1807 non-null   int64              
 5   created          1807 non-null   datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(2), int64(2)
memory usage: 98.8 KB


In [6]:
desc_df = df_orig.describe()
desc_df

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips
count,1807.0,1807.0,1807.0,1807.0
mean,1003.0,0.488267,0.505205,500.871057
std,1.413822,0.291862,0.29123,293.412315
min,1001.0,0.000482,0.000542,0.0
25%,1002.0,0.238879,0.251682,236.0
50%,1003.0,0.491606,0.507843,506.0
75%,1004.0,0.732576,0.770225,754.0
max,1005.0,0.998767,0.999445,998.0


## Mock feature drift data

In [7]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler

In [8]:
N_SAMPLES = 100

X, y = make_classification(n_samples=N_SAMPLES, random_state=random_seed)
# Shift feature range
scaler = MinMaxScaler(feature_range=(0.5, 0.995))
X = scaler.fit_transform(X)

### Mock drift data

In [9]:
mock_df = pd.DataFrame()
mock_df['conv_rate'] = X[:, 0]
mock_df['acc_rate'] = X[:, 1]
mock_df['avg_daily_trips'] = np.array((X[:, 2] * 1000), dtype=int)
mock_df.describe()

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips
count,100.0,100.0,100.0
mean,0.731518,0.701609,751.45
std,0.090697,0.095636,78.185402
min,0.5,0.5,500.0
25%,0.674897,0.627561,698.0
50%,0.728371,0.688873,749.0
75%,0.786183,0.771097,796.25
max,0.995,0.995,994.0


In [10]:
shuffled_orig_df = df_orig.sample(frac=1, random_state=random_seed)
mock_df = mock_df.assign(datetime=shuffled_orig_df['datetime'][:len(mock_df)].values)
mock_df = mock_df.assign(driver_id=shuffled_orig_df['driver_id'][:len(mock_df)].values)
mock_df = mock_df.assign(created=shuffled_orig_df['created'][:len(mock_df)].values)
mock_df['datetime'] = pd.to_datetime(mock_df['datetime'], utc=True)
mock_df

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips,datetime,driver_id,created
0,0.562670,0.711753,817,2021-07-19 23:00:00+00:00,1003,2021-07-28 11:08:04.802
1,0.747795,0.729159,664,2021-07-18 06:00:00+00:00,1005,2021-07-28 11:08:04.802
2,0.577423,0.600396,800,2021-07-28 09:00:00+00:00,1003,2021-07-28 11:08:04.802
3,0.676030,0.587644,820,2021-07-27 10:00:00+00:00,1002,2021-07-28 11:08:04.802
4,0.867539,0.571839,754,2021-07-23 05:00:00+00:00,1001,2021-07-28 11:08:04.802
...,...,...,...,...,...,...
95,0.784332,0.550629,741,2021-07-20 09:00:00+00:00,1004,2021-07-28 11:08:04.802
96,0.682082,0.600372,752,2021-07-23 14:00:00+00:00,1001,2021-07-28 11:08:04.802
97,0.732227,0.874406,841,2021-07-24 12:00:00+00:00,1004,2021-07-28 11:08:04.802
98,0.768284,0.835585,769,2021-07-27 17:00:00+00:00,1003,2021-07-28 11:08:04.802


In [11]:
mock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   conv_rate        100 non-null    float64            
 1   acc_rate         100 non-null    float64            
 2   avg_daily_trips  100 non-null    int64              
 3   datetime         100 non-null    datetime64[ns, UTC]
 4   driver_id        100 non-null    int64              
 5   created          100 non-null    datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(2), int64(2)
memory usage: 4.8 KB


In [12]:
mock_df.describe()

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips,driver_id
count,100.0,100.0,100.0,100.0
mean,0.731518,0.701609,751.45,1002.76
std,0.090697,0.095636,78.185402,1.341791
min,0.5,0.5,500.0,1001.0
25%,0.674897,0.627561,698.0,1001.75
50%,0.728371,0.688873,749.0,1003.0
75%,0.786183,0.771097,796.25,1004.0
max,0.995,0.995,994.0,1005.0


### Generate request df

Include `request_id`, `driver_ids`, `trip_completed_driver_id`

In [13]:
import itertools
import uuid
from collections import Counter

driver_ids_full = list(sorted(pd.unique(df_orig['driver_id'])))
driver_ids_perms = list(itertools.permutations(driver_ids_full, 4))
perms_len = len(driver_ids_perms)
request_id_list = []
driver_ids_list = []
trip_completed_driver_id_list = []

for i in range(N_SAMPLES):
    request_id = str(uuid.uuid1())
    request_id_list.append(request_id)
    driver_ids = driver_ids_perms[np.random.randint(perms_len, size=1)[0]]
    driver_ids_list.append(driver_ids)
    trip_completed_driver_id_list.append(np.random.choice(driver_ids, 1)[0])

print(f"unique uuid = {len(Counter(request_id_list).keys())}")

unique uuid = 100


In [14]:
request_df = pd.DataFrame()
request_df['request_id'] = request_id_list
request_df['driver_ids'] = driver_ids_list
request_df['trip_completed_driver_id'] = trip_completed_driver_id_list
request_df

Unnamed: 0,request_id,driver_ids,trip_completed_driver_id
0,483b6c9e-41ff-11ed-aeb2-fd09fb553743,"(1005, 1003, 1002, 1004)",1003
1,483b6c9f-41ff-11ed-aeb2-fd09fb553743,"(1001, 1004, 1003, 1005)",1003
2,483b6ca0-41ff-11ed-aeb2-fd09fb553743,"(1001, 1005, 1004, 1002)",1005
3,483b6ca1-41ff-11ed-aeb2-fd09fb553743,"(1005, 1003, 1004, 1001)",1003
4,483b6ca2-41ff-11ed-aeb2-fd09fb553743,"(1001, 1005, 1004, 1002)",1002
...,...,...,...
95,483b6cfd-41ff-11ed-aeb2-fd09fb553743,"(1002, 1005, 1001, 1004)",1002
96,483b6cfe-41ff-11ed-aeb2-fd09fb553743,"(1002, 1005, 1004, 1003)",1003
97,483b6cff-41ff-11ed-aeb2-fd09fb553743,"(1005, 1002, 1003, 1004)",1004
98,483b6d00-41ff-11ed-aeb2-fd09fb553743,"(1001, 1005, 1002, 1003)",1001


## Save mock data

In [15]:
OUTSIDE_DATA_DIR = Path("../data")
NORMAL_DATA_PATH = OUTSIDE_DATA_DIR / "mock_normal_data.parquet"
DRIFT_DATA_PATH = OUTSIDE_DATA_DIR / "mock_drift_data.parquet"
REQUEST_DATA_PATH = OUTSIDE_DATA_DIR / "mock_request_data.csv"

data_cols = ['datetime', 'driver_id', 'conv_rate', 'acc_rate', 'avg_daily_trips', 'created']
normal_df = df_orig[data_cols]
normal_df.to_parquet(NORMAL_DATA_PATH, engine="fastparquet")

mock_df[data_cols].to_parquet(DRIFT_DATA_PATH, engine="fastparquet")
request_df.to_csv(REQUEST_DATA_PATH, index=False)