In [48]:
import requests
import datetime
import pandas as pd
import os

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, DatasetCorrelationsMetric, ColumnQuantileMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [36]:


# Download March 2024 Green Taxi data
print("Downloading March 2024 Green Taxi data...")
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"

try:
    response = requests.get(url)
    with open('green_tripdata_2024-03.parquet', 'wb') as f:
        f.write(response.content)
    print(" Data downloaded successfully!")
    
    # Load and examine the data
    df = pd.read_parquet('green_tripdata_2024-03.parquet')
    print(f"\n Dataset Info:")
    print(f"Shape: {df.shape}")
    print(f"Number of rows: {df.shape[0]}")
    print(f"Number of columns: {df.shape[1]}")
    
    print(f"\n Column names:")
    print(df.columns.tolist())
    
    print(f"\n First few rows:")
    print(df.head())
    
    print(f"\n Fare amount statistics:")
    print(df['fare_amount'].describe())
    
except Exception as e:
    print(f" Error downloading data: {e}")

Downloading March 2024 Green Taxi data...
 Data downloaded successfully!

 Dataset Info:
Shape: (57457, 20)
Number of rows: 57457
Number of columns: 20

 Column names:
['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge']

 First few rows:
   VendorID lpep_pickup_datetime lpep_dropoff_datetime store_and_fwd_flag  \
0         2  2024-03-01 00:10:52   2024-03-01 00:26:12                  N   
1         2  2024-03-01 00:22:21   2024-03-01 00:35:15                  N   
2         2  2024-03-01 00:45:27   2024-03-01 01:04:32                  N   
3         1  2024-03-01 00:02:00   2024-03-01 00:23:45                  N   
4         2  2024-03-01 00:16:45   2024-03-01 00:23:25                  N   

   RatecodeID

 Prepare the data for training

In [37]:
# Load the March data if not already loaded
march_data = pd.read_parquet('green_tripdata_2024-03.parquet')

# Create the target column (trip duration in minutes)
march_data["duration_min"] = march_data.lpep_dropoff_datetime - march_data.lpep_pickup_datetime
march_data["duration_min"] = march_data["duration_min"].apply(lambda td: td.total_seconds() / 60)

# Filter outliers
march_data = march_data[(march_data["duration_min"] >= 0) & (march_data["duration_min"] <= 60)]
march_data = march_data[(march_data["passenger_count"] > 0) & (march_data["passenger_count"] <= 8)]

# Define target and feature columns
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

print(" Data is ready for model training.")
print(" Features:", num_features + cat_features)
print(" Target:", target)
print(" Shape:", march_data.shape)

 Data is ready for model training.
 Features: ['passenger_count', 'trip_distance', 'fare_amount', 'total_amount', 'PULocationID', 'DOLocationID']
 Target: duration_min
 Shape: (54135, 21)


Train a simple model

In [None]:


# Split data into training and validation sets
train_data = march_data[:30000].copy()
val_data = march_data[30000:].copy()

# Initialize and train the model
model = LinearRegression()
model.fit(train_data[num_features + cat_features], train_data[target])

print("Model training complete.")

#   Generate predictions
train_preds = model.predict(train_data[num_features + cat_features])
train_data['prediction'] = train_preds

val_preds = model.predict(val_data[num_features + cat_features])
val_data['prediction'] = val_preds

print(" Predictions added to both train and validation datasets.")


Model training complete.
 Predictions added to both train and validation datasets.


Dump the Model and Reference Data for Monitoring

In [39]:
import os
from joblib import dump

# Create directories if they don't exist
os.makedirs("models", exist_ok=True)
os.makedirs("data", exist_ok=True)

#  Save the trained model
with open('models/lin_reg.bin', 'wb') as f_out:
    dump(model, f_out)
print(" Model saved to 'models/lin_reg.bin'.")

#  Save the validation data as reference dataset
val_data.to_parquet('data/reference.parquet')
print(" Reference data saved to 'data/reference.parquet'.")


 Model saved to 'models/lin_reg.bin'.
 Reference data saved to 'data/reference.parquet'.


 Generate and Show Evidently Report

In [40]:
from evidently.report import Report
from evidently import ColumnMapping
from evidently.metrics import (
    ColumnDriftMetric,
    ColumnQuantileMetric,
    DatasetDriftMetric,
    DatasetMissingValuesMetric,
    DatasetCorrelationsMetric
)

# Define column mapping
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features
)

#  Define report with expanded metrics
report = Report(metrics=[
    ColumnDriftMetric(column_name='prediction'),
    ColumnQuantileMetric(column_name='fare_amount', quantile=0.5),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    DatasetCorrelationsMetric()
])

#  Run the report
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

#  Show the report inline (if running in Jupyter)
report.show(mode='inline')

#  Extract results
result = report.as_dict()

# Print individual metric results
print("Prediction Drift Score:", result['metrics'][0]['result']['drift_score'])
print("Number of Drifted Columns:", result['metrics'][2]['result']['number_of_drifted_columns'])
print("Share of Missing Values:", result['metrics'][3]['result']['current']['share_of_missing_values'])
print("Fare Amount Quantile (0.5):")
print("  Reference:", result['metrics'][1]['result']['reference']['value'])
print("  Current:", result['metrics'][1]['result']['current']['value'])

Prediction Drift Score: 0.01006459378009672
Number of Drifted Columns: 0
Share of Missing Values: 0.045458312145695616
Fare Amount Quantile (0.5):
  Reference: 13.5
  Current: 13.5


Expanded Evidently Monitoring with Daily Batches

In [41]:
from datetime import datetime, timedelta
import pandas as pd
from evidently.report import Report
from evidently import ColumnMapping
from evidently.metrics import ColumnQuantileMetric

#  Load March 2024 data
march_data = pd.read_parquet('green_tripdata_2024-03.parquet')


# create target
march_data["duration_min"] = march_data.lpep_dropoff_datetime - march_data.lpep_pickup_datetime
march_data.duration_min = march_data.duration_min.apply(lambda td : float(td.total_seconds())/60)



In [42]:
# data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

Evidently Workspace Script

In [43]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [44]:
ws = Workspace("workspace")

In [45]:
project = ws.create_project("NYC Taxi March 2024 Evaluation Reports")
project.description = "My project Fare Amount measure"
project.save()

Project(id=UUID('972ee783-2fc5-4fca-9f25-52692c87247a'), name='NYC Taxi March 2024 Evaluation Reports', description='My project Fare Amount measure', dashboard=DashboardConfig(name='NYC Taxi March 2024 Evaluation Reports', panels=[], tabs=[], tab_id_to_panel_ids={}), team_id=None, date_from=None, date_to=None)

In [46]:
def add_report(metrics, column_mapping, curr_data, ref_data, start_date, end_date, project_id, ws):
    # Generate tuples with start and end date for every day in 
    dates = [ (start_date + datetime.timedelta(n), start_date + datetime.timedelta(n+1))  
             for n in range(int((end_date - start_date).days))]
    for start,end in dates:
        # DEfine the report
        regular_report = Report(
            metrics=metrics,
            timestamp=start
        )
        #curr_data_filtered= curr_data.lpep_pickup_datetime.between(pd.Timestamp(start),pd.Timestamp(end), inclusive="left")
        curr_data_filtered= curr_data.lpep_pickup_datetime.between(start.strftime('%Y-%m-%d'),end.strftime('%Y-%m-%d'), inclusive="left")
        if curr_data_filtered.any():
            # run the report
            regular_report.run(reference_data=ref_data,
                            current_data=curr_data.loc[curr_data.lpep_pickup_datetime.between(start.strftime('%Y-%m-%d'),end.strftime('%Y-%m-%d'), 
                                                                                             inclusive="left")],
                            column_mapping=column_mapping)
    
            # Add the report to the workspace
            ws.add_report(project_id, regular_report)
            #print("Report executed")
        else:
           regular_report=None 

    return regular_report

In [49]:
metrics=[
        DataQualityPreset(),
        ColumnQuantileMetric(column_name='fare_amount', quantile=0.5),
]
curr_data=march_data
ref_data=None
start_date = datetime.datetime(2024, 3, 1)
end_date   = datetime.datetime(2024, 3, 31)
project_id= project.id

result_test = add_report(metrics, column_mapping, curr_data, ref_data, start_date, end_date, project_id, ws)


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, please use 'h' instead.


'H' is deprecated and will be removed in a future version, pleas

In [50]:
project.save()

Project(id=UUID('972ee783-2fc5-4fca-9f25-52692c87247a'), name='NYC Taxi March 2024 Evaluation Reports', description='My project Fare Amount measure', dashboard=DashboardConfig(name='NYC Taxi March 2024 Evaluation Reports', panels=[], tabs=[], tab_id_to_panel_ids={}), team_id=None, date_from=None, date_to=None)

Define & Save the Dashboard

In [51]:
#configure the dashboard
project.dashboard.add_panel(
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="NYC taxi March 2024 Dashboard"
    )
)

project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Fare Amount Quantile Count",
        values=[
            PanelValue(
                metric_id="ColumnQuantileMetric",
                field_path="current.value",
                legend="count"
            ),
        ],
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
    ),
)

In [52]:
project.save()

Project(id=UUID('972ee783-2fc5-4fca-9f25-52692c87247a'), name='NYC Taxi March 2024 Evaluation Reports', description='My project Fare Amount measure', dashboard=DashboardConfig(name='NYC Taxi March 2024 Evaluation Reports', panels=[DashboardPanelCounter(type='evidently.ui.dashboards.reports.DashboardPanelCounter', id=UUID('fb319a4d-a273-47de-ac29-0d263cb31384'), title='NYC taxi March 2024 Dashboard', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.FULL: 2>, agg=<CounterAgg.NONE: 'none'>, value=None, text=None), DashboardPanelPlot(type='evidently.ui.dashboards.reports.DashboardPanelPlot', id=UUID('1482276b-d94b-406e-b6f0-09f020988557'), title='Fare Amount Quantile Count', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, values=[PanelValue(field_path='current.value', metric_id='ColumnQuantileMetric', metric_hash=None, metric_args={}, legend='count')], plot_type=<PlotType.BAR: '

In [53]:
project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Inference Count",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_rows",
                legend="count"
            ),
        ],
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
    ),
)

project.dashboard.add_panel(
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Number of Missing Values",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_missing_values",
                legend="count"
            ),
        ],
        plot_type=PlotType.LINE,
        size=WidgetSize.HALF,
    ),
)

In [54]:
project.save()

Project(id=UUID('972ee783-2fc5-4fca-9f25-52692c87247a'), name='NYC Taxi March 2024 Evaluation Reports', description='My project Fare Amount measure', dashboard=DashboardConfig(name='NYC Taxi March 2024 Evaluation Reports', panels=[DashboardPanelCounter(type='evidently.ui.dashboards.reports.DashboardPanelCounter', id=UUID('fb319a4d-a273-47de-ac29-0d263cb31384'), title='NYC taxi March 2024 Dashboard', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.FULL: 2>, agg=<CounterAgg.NONE: 'none'>, value=None, text=None), DashboardPanelPlot(type='evidently.ui.dashboards.reports.DashboardPanelPlot', id=UUID('1482276b-d94b-406e-b6f0-09f020988557'), title='Fare Amount Quantile Count', filter=ReportFilter(metadata_values={}, tag_values=[], include_test_suites=False), size=<WidgetSize.HALF: 1>, values=[PanelValue(field_path='current.value', metric_id='ColumnQuantileMetric', metric_hash=None, metric_args={}, legend='count')], plot_type=<PlotType.BAR: '