In [None]:
# Uncomment if you are running this on Google Colab
# !pip install nannyml
# !pip install numpy==1.22

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, plot_importance

import nannyml as nml

In [None]:
# Fetch the green-taxi trip file for 2016-12, loading only the columns this notebook uses
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2016-12.parquet"
columns = [
    'lpep_pickup_datetime',
    'PULocationID',
    'DOLocationID',
    'trip_distance',
    'VendorID',
    'payment_type',
    'fare_amount',
    'tip_amount',
]
data = pd.read_parquet(url, columns=columns)

In [None]:
# Preview the first three rows as a grid-formatted text table
print(
    data.iloc[:3].to_markdown(tablefmt="grid")
)

In [None]:
# Keep credit-card trips only (payment_type == 1), then drop the now-constant column
data = data.loc[data['payment_type'] == 1].drop(columns='payment_type')
# Keep non-negative tips only (zero tips are retained)
data = data.loc[data['tip_amount'] >= 0]

# Order rows chronologically by pickup timestamp and renumber the index from 0
data = (
    data
    .sort_values('lpep_pickup_datetime')
    .reset_index(drop=True)
)

# Store the location and vendor ID columns with the pandas 'category' dtype
categoric_columns = ['PULocationID', 'DOLocationID', 'VendorID']
data[categoric_columns] = data[categoric_columns].astype('category')

# Derive the hour of day of the pickup as an additional column
data['pickup_time'] = data['lpep_pickup_datetime'].dt.hour

In [None]:
# Label every row by pickup date. Bins are left-closed / right-open:
#   [2016-12-01, 2016-12-08) -> 'train'
#   [2016-12-08, 2016-12-16) -> 'test'
#   [2016-12-16, 2017-01-01) -> 'prod'
data['partition'] = pd.cut(
    data['lpep_pickup_datetime'],
    bins=[
        pd.to_datetime(edge)
        for edge in ('2016-12-01', '2016-12-08', '2016-12-16', '2017-01-01')
    ],
    labels=['train', 'test', 'prod'],
    right=False,
)

In [None]:
# The model predicts the tip; every other column except the raw timestamp and
# the partition label is used as a feature
target = 'tip_amount'
features = [
    col
    for col in data.columns
    if col not in (target, 'lpep_pickup_datetime', 'partition')
]

# Training partition
X_train = data.loc[data['partition'].eq('train'), features]
y_train = data.loc[data['partition'].eq('train'), target]

# Test partition
X_test = data.loc[data['partition'].eq('test'), features]
y_test = data.loc[data['partition'].eq('test'), target]

# Production partition
X_prod = data.loc[data['partition'].eq('prod'), features]
y_prod = data.loc[data['partition'].eq('prod'), target]

In [None]:
# Summary statistics of the training target, shown as a one-column table
display(
    y_train
    .describe()
    .to_frame()
)

In [None]:
# Box plot of the training target, written out as SVG before being shown
y_train.plot.box()
plt.savefig('../_static/example_green_taxi_tip_amount_boxplot.svg', format='svg')
plt.show()

# Histogram of the training target with values clipped to [0, 80th percentile]
(
    y_train
    .clip(lower=0, upper=y_train.quantile(0.8))
    .to_frame()
    .hist()
)
plt.savefig('../_static/example_green_taxi_tip_amount_distribution.svg', format='svg')
plt.show()

In [None]:
# Train a LightGBM regressor on the training partition (fixed seed for repeatability)
model = LGBMRegressor(random_state=111).fit(X_train, y_train)

# Score the training and test partitions, in that order
y_pred_train, y_pred_test = (model.predict(split) for split in (X_train, X_test))

In [None]:
# Baseline predictions: always predict the mean tip of the training set
# (the test baseline deliberately reuses the *training* mean)
y_pred_train_baseline = np.ones_like(y_train) * y_train.mean()
y_pred_test_baseline = np.ones_like(y_test) * y_train.mean()

# Measure train, test and baseline performance (mean absolute error, 4 decimals).
# Built-in round() is used instead of the .round() method: the method exists only
# on NumPy scalars, so it raises AttributeError whenever mean_absolute_error
# returns a plain Python float (reportedly the case in recent scikit-learn releases).
mae_train = round(mean_absolute_error(y_train, y_pred_train), 4)
mae_test = round(mean_absolute_error(y_test, y_pred_test), 4)

mae_train_baseline = round(mean_absolute_error(y_train, y_pred_train_baseline), 4)
mae_test_baseline = round(mean_absolute_error(y_test, y_pred_test_baseline), 4)

In [None]:
# Two side-by-side panels comparing actual and predicted tips (train left, test right)
fig, (ax1, ax2) = plt.subplots(figsize=(12, 4), ncols=2)

# Train panel: the title shows the model MAE followed by the baseline MAE
title1 = 'Train MAE: {} (<> {})'.format(mae_train, mae_train_baseline)
ax1.set_title(title1)
ax1.set_xlabel('y_train')
ax1.set_ylabel('y_pred')
ax1.plot(y_train, y_train, color='red', linestyle=':')  # dotted y = x line, i.e. perfect predictions
ax1.scatter(y_train, y_pred_train, alpha=0.1)

# Test panel: same layout, using the test partition
title2 = 'Test MAE: {} (<> {})'.format(mae_test, mae_test_baseline)
ax2.set_title(title2)
ax2.set_xlabel('y_test')
ax2.set_ylabel('y_pred')
ax2.plot(y_test, y_test, color='red', linestyle=':')  # dotted y = x line, i.e. perfect predictions
ax2.scatter(y_test, y_pred_test, alpha=0.1)

plt.show()



In [None]:
# Draw the fitted model's feature importances on a fresh figure and save it as SVG
fig, ax = plt.subplots()
plot_importance(model, ax=ax)
fig.savefig('../_static/example_green_taxi_feature_importance.svg', format='svg')
plt.show()

In [None]:
# Score the production partition with the fitted model
y_pred_prod = model.predict(X_prod)

In [None]:
# Reference set: test features + model predictions + ground truth (correct targets) + pickup timestamp
reference = (
    X_test
    .assign(y_pred=y_pred_test, tip_amount=y_test)
    .join(data['lpep_pickup_datetime'])
)

# Analysis set: prod features + model predictions + pickup timestamp (no targets attached)
analysis = (
    X_prod
    .assign(y_pred=y_pred_prod)
    .join(data['lpep_pickup_datetime'])
)

In [None]:
# Configure the DLE estimator for MAE on the model's predictions
dle = nml.DLE(
    feature_column_names=features,
    y_pred='y_pred',
    y_true='tip_amount',
    timestamp_column_name='lpep_pickup_datetime',
    metrics=['mae'],
    chunk_period='d',  # perform an estimation daily
)

# Fit on the reference (test) data, then estimate on the prod data
dle.fit(reference)
estimated_performance = dle.estimate(analysis)

In [None]:
# Plot the estimated performance and export the figure as SVG
# (the path is a plain string: the f-prefix was dropped because it had no placeholders)
figure = estimated_performance.plot()
figure.write_image('../_static/example_green_taxi_dle.svg')

In [None]:
# Multivariate drift over all model features, one chunk per day ('d')
drdc = nml.DataReconstructionDriftCalculator(
    timestamp_column_name='lpep_pickup_datetime',
    chunk_period='d',
    column_names=features,
)

# Fit on the reference data, then calculate on the analysis data
drdc.fit(reference)
multivariate_data_drift = drdc.calculate(analysis)

In [None]:
# Plot the multivariate drift results and export the figure as SVG
# (the path is a plain string: the f-prefix was dropped because it had no placeholders)
figure = multivariate_data_drift.plot()
figure.write_image('../_static/example_green_taxi_pca_error.svg')

In [None]:
# Per-feature (univariate) drift over all model features, one chunk per day ('d')
udc = nml.UnivariateDriftCalculator(
    timestamp_column_name='lpep_pickup_datetime',
    chunk_period='d',
    column_names=features,
)

# Fit on the reference data, then calculate on the analysis data
udc.fit(reference)
univariate_data_drift = udc.calculate(analysis)

In [None]:
# Distribution plot of the Jensen-Shannon drift results for DOLocationID, all periods
figure = (
    univariate_data_drift
    .filter(period='all', metrics='jensen_shannon', column_names=['DOLocationID'])
    .plot(kind='distribution')
)
# Plain string path: the original f-prefix had no placeholders to format
figure.write_image('../_static/example_green_taxi_location_udc.svg')

In [None]:
# Distribution plot of the Jensen-Shannon drift results for pickup_time, all periods
figure = (
    univariate_data_drift
    .filter(period='all', metrics='jensen_shannon', column_names=['pickup_time'])
    .plot(kind='distribution')
)
# Plain string path: the original f-prefix had no placeholders to format
figure.write_image('../_static/example_green_taxi_pickup_udc.svg')

In [None]:
# Distribution plot of the Jensen-Shannon drift results, all periods, no column filter
figure = (
    univariate_data_drift
    .filter(period='all', metrics='jensen_shannon')
    .plot(kind='distribution')
)

# Plain string path: the original f-prefix had no placeholders to format
figure.write_image('../_static/example_green_taxi_all_udc.svg')


In [None]:
# Calculator for the realized MAE per daily chunk ('d'), using the actual targets
perfc = nml.PerformanceCalculator(
    metrics=['mae'],
    y_true='tip_amount',
    y_pred='y_pred',
    problem_type='regression',
    timestamp_column_name='lpep_pickup_datetime',
    chunk_period='d',
)

perfc.fit(reference)
# `analysis` was built without targets, so attach the prod ground truth for this call only
realized_performance = perfc.calculate(analysis.assign(tip_amount=y_prod))

# Compare the analysis-period estimates with the realized values and export the plot as SVG
# (plain string path: the original f-prefix had no placeholders to format)
figure = estimated_performance.filter(period='analysis').compare(realized_performance).plot()
figure.write_image('../_static/example_green_taxi_dle_vs_realized.svg')