In [None]:
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
import pickle
import datetime

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

In [None]:
def process_dataframe(path):
    """Load a taxi-trip parquet file and derive the columns used for scoring.

    The trip ``duration`` (in minutes) is computed from the pickup and dropoff
    timestamps, only trips lasting between 1 and 60 minutes (both inclusive)
    are kept, the pickup/dropoff location IDs are cast to strings, and a
    combined ``PU_DO`` column of the form ``"<PULocationID>_<DOLocationID>"``
    is added.

    Args:
        path: Location of the parquet file; passed straight to
            ``pd.read_parquet`` (pyarrow engine).

    Returns:
        pandas.DataFrame: the retained rows, with the additional ``duration``
        and ``PU_DO`` columns.
    """
    df = pd.read_parquet(path, engine="pyarrow")
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])

    trip_length = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_length.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: the column assignments below
    # would otherwise write into a slice of the unfiltered frame, which raises
    # SettingWithCopyWarning on pandas < 3.
    df = df[df["duration"].between(1, 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    df["PU_DO"] = df["PULocationID"] + "_" + df["DOLocationID"]

    return df

In [None]:
def load_model(tracking_uri="http://127.0.0.1:5000", experiment_name="experiment-1"):
    """Load the model and ``dv.pkl`` artifact of the most recently started run.

    Looks up ``experiment_name`` on the MLflow tracking server, takes the run
    with the latest start time, loads its ``model`` artifact as a pyfunc model
    and unpickles the ``artifacts/dv.pkl`` artifact from the same run.

    Args:
        tracking_uri: MLflow tracking server URI. Also set as the global
            tracking URI via ``mlflow.set_tracking_uri``.
        experiment_name: Name of the experiment whose latest run is used.

    Returns:
        tuple: ``(model, dv)`` where ``model`` is the pyfunc model and ``dv``
        is the object unpickled from ``dv.pkl`` (presumably the fitted feature
        vectorizer; confirm against the training code).

    Raises:
        ValueError: if no experiment named ``experiment_name`` exists.
        IndexError: if the experiment has no runs.
    """
    mlflow.set_tracking_uri(tracking_uri)
    client = MlflowClient(tracking_uri)

    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on `.experiment_id`.
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(
            f"MLflow experiment {experiment_name!r} not found at {tracking_uri}"
        )

    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time desc"],
        max_results=1,
    )
    if not runs:
        raise IndexError(f"MLflow experiment {experiment_name!r} has no runs")
    run_id = runs[0].info.run_id

    model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")

    artifact_uri = f"runs:/{run_id}/artifacts/dv.pkl"
    dv_path = mlflow.artifacts.download_artifacts(artifact_uri=artifact_uri)
    # SECURITY: pickle.load can execute arbitrary code; only point this at a
    # tracking server whose artifacts you trust.
    with open(dv_path, "rb") as f_in:
        dv = pickle.load(f_in)

    return model, dv

In [None]:
def predict_durations(df, model, dv):
    """Score ``df`` with ``model`` and store the result in a ``prediction`` column.

    The ``PU_DO`` and ``trip_distance`` columns are converted to a list of
    per-row dicts, passed through ``dv.transform`` and then ``model.predict``.

    Args:
        df: DataFrame containing at least ``PU_DO`` and ``trip_distance``.
        model: Object exposing ``predict(features)``.
        dv: Object exposing ``transform(records)``.

    Returns:
        The same ``df`` object. It is modified in place: the ``prediction``
        column is added, or overwritten if it already exists.
    """
    feature_columns = ["PU_DO", "trip_distance"]

    records = df[feature_columns].to_dict(orient="records")
    df["prediction"] = model.predict(dv.transform(records))

    return df

In [None]:
# November 2024 trips; passed as reference_data when the drift report is run.
train_data = process_dataframe("../data/yellow_tripdata_2024-11.parquet")

In [None]:
# December 2024 trips; passed as current_data when the reports are run.
val_data = process_dataframe("../data/yellow_tripdata_2024-12.parquet")

In [None]:
# Fetch the model and the unpickled dv.pkl artifact from the MLflow tracking server.
model, dv = load_model()

In [None]:
# Score the reference frame. predict_durations writes the `prediction` column
# into the frame it is given and returns that same frame, so re-running this
# cell simply recomputes the column.
train_data = predict_durations(train_data, model, dv)

In [None]:
# Score the current frame. predict_durations writes the `prediction` column
# into the frame it is given and returns that same frame, so re-running this
# cell simply recomputes the column.
val_data = predict_durations(val_data, model, dv)

In [None]:
# Column roles handed to Evidently: one numerical and one categorical feature,
# the model output in `prediction`, and no target column.
column_mapping = ColumnMapping(
    numerical_features=["trip_distance"],
    categorical_features=["PU_DO"],
    prediction="prediction",
    target=None,
)

In [None]:
# Drift report: drift of the `prediction` column, dataset-level drift,
# and dataset-level missing values.
report = Report(
    metrics=[
        ColumnDriftMetric(column_name="prediction"),
        DatasetDriftMetric(),
        DatasetMissingValuesMetric(),
    ],
)

In [None]:
# Evaluate the metrics with train_data as reference and val_data as current.
report.run(
    reference_data=train_data,
    current_data=val_data,
    column_mapping=column_mapping,
)

In [None]:
# Export the computed report through as_dict() for inspection.
result = report.as_dict()

In [None]:
# Show the exported report contents as this cell's output.
result

In [None]:
# report.show(mode='inline')

In [None]:
from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [None]:
# Workspace addressed by the relative path "workspace".
ws = Workspace("workspace")
# NOTE(review): create_project is called unconditionally, so re-running this
# cell presumably creates another project with the same name -- confirm.
project = ws.create_project("NYC Taxi Data Quality Report")
project.description = "NYC Taxi Data Quality Report"
project.save()
# Attach the drift report (run earlier on train_data / val_data) to the project.
ws.add_report(project.id, report)

In [None]:
# Data-quality report computed on the current data only (no reference frame).
# NOTE(review): the report timestamp is fixed at 2022-01-28 although val_data
# is read from the 2024-12 file -- confirm this date is intentional.
data_quality_report = Report(
    metrics=[DataQualityPreset()],
    timestamp=datetime.datetime(2022, 1, 28),
)

data_quality_report.run(
    reference_data=None,
    current_data=val_data,
    column_mapping=column_mapping,
)

In [None]:
# Attach the data-quality report built and run in the previous cell.
ws.add_report(project.id, data_quality_report)

In [None]:
# Counter panel configured with a title only: CounterAgg.NONE and an empty
# metadata/tag report filter.
project.dashboard.add_panel(
    DashboardPanelCounter(
        title="NYC taxi data dashboard",
        agg=CounterAgg.NONE,
        filter=ReportFilter(metadata_values={}, tag_values=[]),
    )
)

In [None]:
# Half-width bar plot of `current.number_of_missing_values` read from
# DatasetMissingValuesMetric, with an empty metadata/tag report filter.
project.dashboard.add_panel(
    DashboardPanelPlot(
        title="Number of Missing Values",
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        values=[
            PanelValue(
                legend="Missing Values",
                metric_id="DatasetMissingValuesMetric",
                field_path="current.number_of_missing_values",
            ),
        ],
    ),
)

In [None]:
# Half-width bar plot of `current.number_of_rows` read from
# DatasetSummaryMetric, with an empty metadata/tag report filter.
# NOTE(review): DatasetSummaryMetric is not listed explicitly in any report in
# this notebook; presumably it is supplied by DataQualityPreset -- confirm.
project.dashboard.add_panel(
    DashboardPanelPlot(
        title="Inference Count",
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        values=[
            PanelValue(
                legend="count",
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_rows",
            ),
        ],
    ),
)

In [None]:
# Save the project after the add_panel calls in the cells above.
project.save()

In [None]:
# Half-width line plot of `share_of_drifted_columns` read from
# DatasetDriftMetric, with an empty metadata/tag report filter.
# NOTE(review): the legend says "(%)" while the field is named as a share --
# confirm the units. This panel is also added after the last visible
# project.save() call -- confirm it gets persisted.
project.dashboard.add_panel(
    DashboardPanelPlot(
        title="Dataset Drift Over Time",
        plot_type=PlotType.LINE,
        size=WidgetSize.HALF,
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        values=[
            PanelValue(
                legend="Drifted Columns (%)",
                metric_id="DatasetDriftMetric",
                field_path="share_of_drifted_columns",
            ),
        ],
    ),
)

In [None]:
# Print the id and title of every panel currently on the project dashboard.
for panel in project.dashboard.panels:
    print(f"Panel ID: {panel.id}, Title: {panel.title}")

In [None]:
# Remove three dashboard panels by id.
# NOTE(review): the ids are hard-coded literals, presumably copied from the
# panel listing of an earlier session; on a freshly created project the ids
# will likely differ -- confirm before running. No project.save() follows in
# the visible cells, so the removal may not be persisted.
project.dashboard.remove_panel(panel_id="019545bd-d90f-773b-b6ef-152c98ea8386")
project.dashboard.remove_panel(panel_id="019545be-f3ef-7ba1-8832-756f414c1223")
project.dashboard.remove_panel(panel_id="019545c0-98e5-719f-9dfa-03f205717a51")