In [1]:
!pwd

/Users/tung.dao/tung/mlopsvn/code/mlops-crash-course-code/monitoring_service/nbs


In [2]:
import pandas as pd
import fastparquet
from pathlib import Path
import numpy as np

random_seed = 17
np.random.seed(random_seed)

## Load data

In [3]:
OUTSIDE_DATA_DIR = Path("../data")
DATA_PATH = OUTSIDE_DATA_DIR / "orig_driver_stats.parquet"
if not DATA_PATH.is_file():
    raise Exception("DATA_PATH not found")

In [4]:
df_orig = pd.read_parquet(DATA_PATH, engine='fastparquet')
df_orig

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-13 11:00:00+00:00,1005,0.373837,0.154890,498,2021-07-28 11:08:04.802
1,2021-07-13 12:00:00+00:00,1005,0.571627,0.643958,656,2021-07-28 11:08:04.802
2,2021-07-13 13:00:00+00:00,1005,0.399909,0.993888,722,2021-07-28 11:08:04.802
3,2021-07-13 14:00:00+00:00,1005,0.967468,0.788458,424,2021-07-28 11:08:04.802
4,2021-07-13 15:00:00+00:00,1005,0.024679,0.956064,569,2021-07-28 11:08:04.802
...,...,...,...,...,...,...
1802,2021-07-28 09:00:00+00:00,1001,0.089418,0.311234,485,2021-07-28 11:08:04.802
1803,2021-07-28 10:00:00+00:00,1001,0.222534,0.927691,114,2021-07-28 11:08:04.802
1804,2021-04-12 07:00:00+00:00,1001,0.175219,0.761434,385,2021-07-28 11:08:04.802
902,2021-07-20 23:00:00+00:00,1003,0.025968,0.109748,55,2021-07-28 11:08:04.802


In [5]:
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1807 entries, 0 to 902
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   datetime         1807 non-null   datetime64[ns, UTC]
 1   driver_id        1807 non-null   int64              
 2   conv_rate        1807 non-null   float64            
 3   acc_rate         1807 non-null   float64            
 4   avg_daily_trips  1807 non-null   int64              
 5   created          1807 non-null   datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(2), int64(2)
memory usage: 98.8 KB


In [6]:
desc_df = df_orig.describe()
desc_df

Unnamed: 0,driver_id,conv_rate,acc_rate,avg_daily_trips
count,1807.0,1807.0,1807.0,1807.0
mean,1003.0,0.488267,0.505205,500.871057
std,1.413822,0.291862,0.29123,293.412315
min,1001.0,0.000482,0.000542,0.0
25%,1002.0,0.238879,0.251682,236.0
50%,1003.0,0.491606,0.507843,506.0
75%,1004.0,0.732576,0.770225,754.0
max,1005.0,0.998767,0.999445,998.0


## Mock feature drift data

In [7]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler

In [8]:
driver_ids = np.unique(df_orig['driver_id'])
print(driver_ids)
N_SAMPLES = driver_ids.shape[0]

X, _ = make_classification(n_samples=N_SAMPLES, random_state=random_seed)
scaler = MinMaxScaler(feature_range=(0.05, 0.25))
X = scaler.fit_transform(X)
# Shift feature range
scaler = MinMaxScaler(feature_range=(0.75, 0.95))
X_shift = scaler.fit_transform(X)

[1001 1002 1003 1004 1005]


### Mock normal data

In [9]:
def create_dataset(generated_X):
    df = pd.DataFrame()
    df['conv_rate'] = generated_X[:, 0]
    df['acc_rate'] = generated_X[:, 1]
    df['avg_daily_trips'] = np.array((generated_X[:, 2] * 1000), dtype=int)
    return df

In [10]:
normal_df = create_dataset(X)
normal_df.describe()

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips
count,5.0,5.0,5.0
mean,0.148341,0.189847,151.0
std,0.084737,0.08082,91.47404
min,0.05,0.05,49.0
25%,0.071032,0.192864,103.0
50%,0.184332,0.226879,107.0
75%,0.186341,0.22949,246.0
max,0.25,0.25,250.0


### Mock drift data

In [11]:
drift_df = create_dataset(X_shift)
drift_df.describe()

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips
count,5.0,5.0,5.0
mean,0.848341,0.889847,851.2
std,0.084737,0.08082,91.195943
min,0.75,0.75,750.0
25%,0.771032,0.892864,803.0
50%,0.884332,0.926879,807.0
75%,0.886341,0.92949,946.0
max,0.95,0.95,950.0


### Post-format data

In [12]:
shuffled_orig_df = df_orig.sample(frac=1, random_state=random_seed)

def post_format_df(df):
    df = df.assign(driver_id=driver_ids)
    df = df.assign(created=shuffled_orig_df['created'][:len(df)].values)
    df = df.assign(datetime=shuffled_orig_df['datetime'][:len(df)].values)
    df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
    return df

normal_df = post_format_df(normal_df)
drift_df = post_format_df(drift_df)

In [13]:
normal_df

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips,driver_id,created,datetime
0,0.186341,0.226879,107,1001,2021-07-28 11:08:04.802,2021-07-19 23:00:00+00:00
1,0.071032,0.22949,250,1002,2021-07-28 11:08:04.802,2021-07-18 06:00:00+00:00
2,0.05,0.192864,103,1003,2021-07-28 11:08:04.802,2021-07-28 09:00:00+00:00
3,0.184332,0.05,49,1004,2021-07-28 11:08:04.802,2021-07-27 10:00:00+00:00
4,0.25,0.25,246,1005,2021-07-28 11:08:04.802,2021-07-23 05:00:00+00:00


In [14]:
normal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   conv_rate        5 non-null      float64            
 1   acc_rate         5 non-null      float64            
 2   avg_daily_trips  5 non-null      int64              
 3   driver_id        5 non-null      int64              
 4   created          5 non-null      datetime64[ns]     
 5   datetime         5 non-null      datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(2), int64(2)
memory usage: 368.0 bytes


In [15]:
drift_df

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips,driver_id,created,datetime
0,0.886341,0.926879,807,1001,2021-07-28 11:08:04.802,2021-07-19 23:00:00+00:00
1,0.771032,0.92949,950,1002,2021-07-28 11:08:04.802,2021-07-18 06:00:00+00:00
2,0.75,0.892864,803,1003,2021-07-28 11:08:04.802,2021-07-28 09:00:00+00:00
3,0.884332,0.75,750,1004,2021-07-28 11:08:04.802,2021-07-27 10:00:00+00:00
4,0.95,0.95,946,1005,2021-07-28 11:08:04.802,2021-07-23 05:00:00+00:00


In [16]:
drift_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   conv_rate        5 non-null      float64            
 1   acc_rate         5 non-null      float64            
 2   avg_daily_trips  5 non-null      int64              
 3   driver_id        5 non-null      int64              
 4   created          5 non-null      datetime64[ns]     
 5   datetime         5 non-null      datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(2), int64(2)
memory usage: 368.0 bytes


### Mock target

In [17]:
y = np.random.choice([0, 1], size=N_SAMPLES, p=[0.3, 0.7])
y

array([0, 1, 0, 0, 1])

### Generate request df

Include `request_id`, `driver_ids`, `trip_completed_driver_id`

In [18]:
import itertools
from collections import Counter

request_id_list = []
driver_ids_list = []

for i in range(N_SAMPLES):
    request_id = f"uuid-{i}"
    request_id_list.append(request_id)
    driver_id = driver_ids[i % len(driver_ids)]
    driver_ids_list.append([driver_id])

print(f"unique uuid = {len(Counter(request_id_list).keys())}")

unique uuid = 5


In [19]:
request_df = pd.DataFrame()
request_df['request_id'] = request_id_list
request_df['driver_ids'] = driver_ids_list
request_df['trip_completed'] = y
request_df

Unnamed: 0,request_id,driver_ids,trip_completed
0,uuid-0,[1001],0
1,uuid-1,[1002],1
2,uuid-2,[1003],0
3,uuid-3,[1004],0
4,uuid-4,[1005],1


In [20]:
request_df.describe()

Unnamed: 0,trip_completed
count,5.0
mean,0.4
std,0.547723
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


## Save mock data

In [21]:
NORMAL_DATA_PATH = OUTSIDE_DATA_DIR / "mock_normal_data.parquet"
DRIFT_DATA_PATH = OUTSIDE_DATA_DIR / "mock_drift_data.parquet"
REQUEST_DATA_PATH = OUTSIDE_DATA_DIR / "mock_request_data.csv"

data_cols = ['datetime', 'driver_id', 'conv_rate', 'acc_rate', 'avg_daily_trips', 'created']
normal_df[data_cols].to_parquet(NORMAL_DATA_PATH, engine="fastparquet")

drift_df[data_cols].to_parquet(DRIFT_DATA_PATH, engine="fastparquet")
request_df.to_csv(REQUEST_DATA_PATH, index=False)