# Baseline model for batch monitoring example

In [None]:
import requests
import datetime
import pandas as pd

# from evidently import ColumnMapping
# from evidently.report import Report
# from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
files = [
    # ('green_tripdata_2022-02.parquet', './data'),
    # ('green_tripdata_2022-01.parquet', './data'),
    ('green_tripdata_2023-03.parquet', './data')]

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp = requests.get(url, stream=True)
    save_path = f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                         desc=f"{file}",
                         postfix=f"save to {save_path}",
                         total=int(resp.headers["Content-Length"])):
            handle.write(data)

green_tripdata_2023-03.parquet: 100%|██████████| 1730999/1730999 [00:04<00:00, 415213.05it/s, save to ./data/green_tripdata_2023-03.parquet]


In [3]:
march_2023_data = pd.read_parquet('data/green_tripdata_2023-03.parquet')

In [5]:
march_2023_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-03-01 00:25:10,2023-03-01 00:35:47,N,1.0,82,196,1.0,2.36,13.5,1.0,0.5,0.0,0.0,,1.0,16.0,2.0,1.0,0.0
1,2,2023-03-01 00:14:29,2023-03-01 00:25:04,N,1.0,7,7,1.0,0.78,-6.5,-1.0,-0.5,0.0,0.0,,-1.0,-9.0,3.0,1.0,0.0
2,2,2023-03-01 00:14:29,2023-03-01 00:25:04,N,1.0,7,7,1.0,0.78,6.5,1.0,0.5,0.0,0.0,,1.0,9.0,3.0,1.0,0.0
3,2,2023-02-28 22:59:46,2023-02-28 23:08:38,N,1.0,166,74,1.0,1.66,11.4,1.0,0.5,2.78,0.0,,1.0,16.68,1.0,1.0,0.0
4,2,2023-03-01 00:54:03,2023-03-01 01:03:14,N,1.0,236,229,1.0,3.14,15.6,1.0,0.5,4.17,0.0,,1.0,25.02,1.0,1.0,2.75


In [4]:
len(march_2023_data)

72044

In [None]:
jan_data = pd.read_parquet('data/green_tripdata_2022-01.parquet')

In [None]:
jan_data.describe()

In [None]:
jan_data.shape

In [None]:
# create target
jan_data["duration_min"] = jan_data.lpep_dropoff_datetime - jan_data.lpep_pickup_datetime
jan_data.duration_min = jan_data.duration_min.apply(lambda td : float(td.total_seconds())/60)

In [None]:
# filter out outliers
jan_data = jan_data[(jan_data.duration_min >= 0) & (jan_data.duration_min <= 60)]
jan_data = jan_data[(jan_data.passenger_count > 0) & (jan_data.passenger_count <= 8)]

In [None]:
jan_data.duration_min.hist()

In [None]:
# data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

In [None]:
jan_data.shape

In [None]:
train_data = jan_data[:30000].copy()
val_data = jan_data[30000:].copy()

In [None]:
model = LinearRegression()

In [None]:
model.fit(train_data[num_features + cat_features], train_data[target])

In [None]:
train_preds = model.predict(train_data[num_features + cat_features])
train_data['prediction'] = train_preds

In [None]:
val_preds = model.predict(val_data[num_features + cat_features])
val_data['prediction'] = val_preds

In [None]:
print(mean_absolute_error(train_data.duration_min, train_data.prediction))
print(mean_absolute_error(val_data.duration_min, val_data.prediction))

# Dump model and reference data

In [None]:
with open('models/lin_reg.bin', 'wb') as f_out:
    dump(model, f_out)

In [None]:
val_data.head()

In [None]:
val_data.to_parquet('data/reference.parquet')

# Evidently Report

In [None]:
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features
)

In [None]:
report = Report(metrics=[
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric()
]
)

In [None]:
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

In [None]:
report.show(mode='inline')

In [None]:
result = report.as_dict()

In [None]:
result

In [None]:
#prediction drift
result['metrics'][0]['result']['drift_score']

In [None]:
#number of drifted columns
result['metrics'][1]['result']['number_of_drifted_columns']

In [None]:
#share of missing values
result['metrics'][2]['result']['current']['share_of_missing_values']