In [1]:
import os
# Move the working directory one level up so the relative paths used below
# (./data, ./tmp, ./model) resolve against the parent directory.
# NOTE: this is a relative move, so every re-execution of this cell climbs one
# more level; restart the kernel before re-running the notebook.
os.chdir("..")
!pwd

/Users/tung.dao/tung/mlopsvn/code/mlops-crash-course-code/training


In [2]:
import pandas as pd
import fastparquet
from pathlib import Path
import numpy as np

# Single seed shared by NumPy's global RNG, the train/test split and the model.
random_seed = 17
np.random.seed(random_seed)

## Load data

In [3]:
# Input locations, all relative to the current working directory.
DATA_DIR = Path("./data")
TMP_DIR = Path("./tmp")
DATA_PATH = DATA_DIR / "driver_stats.parquet"
LABEL_PATH = DATA_DIR / "driver_orders.csv"

# Fail fast when an input is missing. The message carries the resolved path so
# a wrong working directory is immediately visible; FileNotFoundError is still
# an Exception, so any existing `except Exception` handling keeps working.
if not DATA_PATH.is_file():
    raise FileNotFoundError(f"DATA_PATH not found: {DATA_PATH.resolve()}")
if not LABEL_PATH.is_file():
    raise FileNotFoundError(f"LABEL_PATH not found: {LABEL_PATH.resolve()}")

In [4]:
# Load the driver statistics table from DATA_PATH with the fastparquet engine.
df_orig = pd.read_parquet(DATA_PATH, engine='fastparquet')
df_orig

Unnamed: 0,datetime,driver_id,conv_rate,acc_rate,avg_daily_trips,created
0,2021-07-13 11:00:00+00:00,1005,0.373837,0.154890,498,2021-07-28 11:08:04.802
1,2021-07-13 12:00:00+00:00,1005,0.571627,0.643958,656,2021-07-28 11:08:04.802
2,2021-07-13 13:00:00+00:00,1005,0.399909,0.993888,722,2021-07-28 11:08:04.802
3,2021-07-13 14:00:00+00:00,1005,0.967468,0.788458,424,2021-07-28 11:08:04.802
4,2021-07-13 15:00:00+00:00,1005,0.024679,0.956064,569,2021-07-28 11:08:04.802
...,...,...,...,...,...,...
1802,2021-07-28 09:00:00+00:00,1001,0.089418,0.311234,485,2021-07-28 11:08:04.802
1803,2021-07-28 10:00:00+00:00,1001,0.222534,0.927691,114,2021-07-28 11:08:04.802
1804,2021-04-12 07:00:00+00:00,1001,0.175219,0.761434,385,2021-07-28 11:08:04.802
902,2021-07-20 23:00:00+00:00,1003,0.025968,0.109748,55,2021-07-28 11:08:04.802


In [5]:
# Load the labels. Despite the .csv extension the file is parsed as
# tab-separated (sep="\t").
label_orig = pd.read_csv(LABEL_PATH, sep="\t")
label_orig

Unnamed: 0,event_timestamp,driver_id,trip_completed
0,2021-04-16 20:29:28+00:00,1001,1
1,2021-04-17 04:29:28+00:00,1002,0
2,2021-04-17 12:29:28+00:00,1003,0
3,2021-04-17 20:29:28+00:00,1001,1
4,2021-04-18 04:29:28+00:00,1002,0
5,2021-04-18 12:29:28+00:00,1003,0
6,2021-04-18 20:29:28+00:00,1001,1
7,2021-04-19 04:29:28+00:00,1002,0
8,2021-04-19 12:29:28+00:00,1003,0
9,2021-04-19 20:29:28+00:00,1004,1


### Format timestamp

In [6]:
# Parse the event_timestamp column with pd.to_datetime and write the result
# back into label_orig (the frame loaded above is modified in place).
label_orig["event_timestamp"] = pd.to_datetime(label_orig["event_timestamp"])
label_orig

Unnamed: 0,event_timestamp,driver_id,trip_completed
0,2021-04-16 20:29:28+00:00,1001,1
1,2021-04-17 04:29:28+00:00,1002,0
2,2021-04-17 12:29:28+00:00,1003,0
3,2021-04-17 20:29:28+00:00,1001,1
4,2021-04-18 04:29:28+00:00,1002,0
5,2021-04-18 12:29:28+00:00,1003,0
6,2021-04-18 20:29:28+00:00,1001,1
7,2021-04-19 04:29:28+00:00,1002,0
8,2021-04-19 12:29:28+00:00,1003,0
9,2021-04-19 20:29:28+00:00,1004,1


In [7]:
# Name of the label column: dropped from the features and used as y below.
target_col = "trip_completed"

## Explore data

In [8]:
# Column dtypes, non-null counts and memory usage of the feature table.
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1807 entries, 0 to 902
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   datetime         1807 non-null   datetime64[ns, UTC]
 1   driver_id        1807 non-null   int64              
 2   conv_rate        1807 non-null   float64            
 3   acc_rate         1807 non-null   float64            
 4   avg_daily_trips  1807 non-null   int64              
 5   created          1807 non-null   datetime64[ns]     
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(2), int64(2)
memory usage: 98.8 KB


In [9]:
# Column dtypes, non-null counts and memory usage of the label table.
label_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   event_timestamp  10 non-null     datetime64[ns, UTC]
 1   driver_id        10 non-null     int64              
 2   trip_completed   10 non-null     int64              
dtypes: datetime64[ns, UTC](1), int64(2)
memory usage: 368.0 bytes


## Training

### Split data

In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.tracking import MlflowClient
import mlflow

In [11]:
def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Args:
        actual: true target values.
        pred: predicted target values.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and R^2 score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

def yield_artifacts(run_id, path=None):
    """Walk the artifacts of an MLflow run and yield the path of every file.

    Args:
        run_id: ID of the run whose artifacts are listed.
        path: artifact sub-path to list; ``None`` is passed through to
            ``list_artifacts`` unchanged.

    Yields:
        The ``path`` attribute of each non-directory entry; directory entries
        are descended into recursively, in listing order.
    """
    for entry in MlflowClient().list_artifacts(run_id, path):
        if not entry.is_dir:
            yield entry.path
            continue
        # Directory: recurse and forward everything found underneath it.
        yield from yield_artifacts(run_id, entry.path)

def fetch_logged_data(run_id):
    """Collect what was logged to an MLflow run.

    Args:
        run_id: ID of the run to inspect.

    Returns:
        A dict with the keys ``"params"``, ``"metrics"``, ``"tags"`` (tags
        whose key starts with ``"mlflow."`` are left out) and ``"artifacts"``
        (list of artifact file paths from ``yield_artifacts``).
    """
    run_data = MlflowClient().get_run(run_id).data

    # Skip system tags: https://www.mlflow.org/docs/latest/tracking.html#system-tags
    user_tags = {}
    for name, value in run_data.tags.items():
        if name.startswith("mlflow."):
            continue
        user_tags[name] = value

    artifact_paths = list(yield_artifacts(run_id))

    return dict(
        params=run_data.params,
        metrics=run_data.metrics,
        tags=user_tags,
        artifacts=artifact_paths,
    )

In [12]:
# Join features and labels on driver_id, then keep only the model inputs and
# the target by dropping the key and timestamp columns.
# NOTE(review): the join key is driver_id alone, so each stats row is paired
# with every label row of the same driver regardless of time - confirm that
# this is intended.
data_df = df_orig.merge(label_orig, on="driver_id").drop(
    columns=["datetime", "driver_id", "created", "event_timestamp"]
)
data_df

Unnamed: 0,conv_rate,acc_rate,avg_daily_trips,trip_completed
0,0.247582,0.795497,772,1
1,0.875381,0.025149,203,1
2,0.977574,0.408694,158,1
3,0.391113,0.092807,731,1
4,0.611149,0.144825,468,1
...,...,...,...,...
3611,0.222534,0.927691,114,1
3612,0.222534,0.927691,114,1
3613,0.175219,0.761434,385,1
3614,0.175219,0.761434,385,1


In [13]:
# Fraction of rows held out for evaluation.
TEST_SIZE = 0.2

train, test = train_test_split(
    data_df,
    test_size=TEST_SIZE,
    random_state=random_seed,
)

# Features are every column except the target; targets stay one-column frames.
train_x, train_y = train.drop(columns=[target_col]), train[[target_col]]
test_x, test_y = test.drop(columns=[target_col]), test[[target_col]]

train_x.shape, train_y.shape, test_x.shape, test_y.shape

((2892, 3), (2892, 1), (724, 3), (724, 1))

### Training and evaluation

In [14]:
# ElasticNet hyper-parameters.
ALPHA = 0.5
L1_RATIO = 0.1

MLFLOW_TRACKING_URI = "http://localhost:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.sklearn.autolog()

# Open the run explicitly instead of relying on get_artifact_uri() to create
# one as a side effect. The context manager closes the run even when training
# or logging raises, so a failed execution cannot leave an active run behind
# that later executions would silently keep logging into.
with mlflow.start_run() as run:
    print((mlflow.get_tracking_uri(), mlflow.get_artifact_uri()))

    model = ElasticNet(alpha=ALPHA, l1_ratio=L1_RATIO, random_state=random_seed)
    model.fit(train_x, train_y)

    # Evaluate on the held-out split.
    predicted_qualities = model.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (ALPHA, L1_RATIO))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Test-set metrics are logged by hand; the params and the model are logged
    # explicitly as well, alongside whatever autolog records during fit().
    mlflow.log_param("alpha", ALPHA)
    mlflow.log_param("l1_ratio", L1_RATIO)
    mlflow.log_metric("testing_rmse", rmse)
    mlflow.log_metric("testing_r2", r2)
    mlflow.log_metric("testing_mae", mae)
    mlflow.sklearn.log_model(model, "model")

# Take the ID from the run handle opened above rather than asking MLflow for
# the "last active" run.
run_id = run.info.run_id
print("Logged data and model in run {}".format(run_id))
for key, data in fetch_logged_data(run_id).items():
    print("\n---------- logged {} ----------".format(key))
    print(data)

('http://localhost:5000', 'mlflow-artifacts:/0/0b42ea038b494c02a59a2216a11a5c71/artifacts')




Elasticnet model (alpha=0.500000, l1_ratio=0.100000):
  RMSE: 0.4903734721027044
  MAE: 0.480175079686162
  R2: -0.00034586893532195795
Logged data and model in run 0b42ea038b494c02a59a2216a11a5c71

---------- logged params ----------
{'alpha': '0.5', 'copy_X': 'True', 'fit_intercept': 'True', 'l1_ratio': '0.1', 'max_iter': '1000', 'normalize': 'deprecated', 'positive': 'False', 'precompute': 'False', 'random_state': '17', 'selection': 'cyclic', 'tol': '0.0001', 'warm_start': 'False'}

---------- logged metrics ----------
{'training_mse': 0.23969650650486524, 'training_mae': 0.47939508605448006, 'training_r2_score': 0.00016256694638772107, 'training_rmse': 0.48958809881865517, 'training_score': 0.00016256694638772107, 'testing_rmse': 0.4903734721027044, 'testing_r2': -0.00034586893532195795, 'testing_mae': 0.480175079686162}

---------- logged tags ----------
{'estimator_name': 'ElasticNet', 'estimator_class': 'sklearn.linear_model._coordinate_descent.ElasticNet'}

---------- logged ar

### Export

In [15]:
from joblib import dump

MODEL_DIR = Path("./model")
# Create the output directory if needed: dump() does not create missing parent
# directories, so on a fresh checkout the export would otherwise fail.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
# Serialize the fitted estimator; dump() returns the list of written files.
dump(model, MODEL_DIR / "driver_model.bin")

['model/driver_model.bin']