In [7]:
# Q2: Metric Monitoring for NYC Taxi Data
# ---------------------------------------
# Goal: Add one metric of choice and a quantile value for "fare_amount" column (quantile=0.5)
# We use Evidently (latest version, v0.5+) for data quality monitoring

import pandas as pd
from evidently import Report
from evidently.metrics import QuantileValue, MissingValueCount
import json

# Step 1: Load March 2024 Green Taxi data
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
df = pd.read_parquet(url)

# Step 2: Compute trip duration and filter out unrealistic trips
df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
df = df[(df.duration >= 1) & (df.duration <= 60)]

# Step 3: Split dataset into reference and current for monitoring
reference_data = df.sample(frac=0.5, random_state=42)
current_data = df.drop(reference_data.index)

# Step 4: Define metrics to monitor
# - MissingValueCount: tracks number of missing values in 'fare_amount'
# - QuantileValue: tracks median (0.5 quantile) of 'fare_amount'
report = Report(metrics=[
    MissingValueCount(column="fare_amount"),
    QuantileValue(column="fare_amount", quantile=0.5)
])

# Step 5: Run report to compute metrics
report.run(reference_data=reference_data, current_data=current_data)

# Step 6: Extract metrics programmatically
report_json_str = report.render_json()  # returns JSON string
metrics_dict = json.loads(report_json_str)

# Step 7: Get the metrics values
missing_values = metrics_dict["metrics"][0]["result"]["current"]["count"]
median_fare = metrics_dict["metrics"][1]["result"]["current"]["value"]

print(f"Missing values in fare_amount: {missing_values}")
print(f"Median fare_amount (quantile=0.5): {median_fare}")



invalid value encountered in scalar divide


invalid value encountered in scalar divide



AttributeError: 'Report' object has no attribute 'render_json'

In [8]:
import pandas as pd
from evidently import Report
from evidently.metrics import QuantileValue, MissingValueCount

# Load March 2024 Green Taxi data
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
df = pd.read_parquet(url)

# Compute duration and filter
df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
df = df[(df.duration >= 1) & (df.duration <= 60)]

# Split reference & current
reference_data = df.sample(frac=0.5, random_state=42)
current_data = df.drop(reference_data.index)

# Define metrics
report = Report(metrics=[
    MissingValueCount(column="fare_amount"),
    QuantileValue(column="fare_amount", quantile=0.5)
])

# Run report
report.run(reference_data=reference_data, current_data=current_data)

# Access metrics via report.results
results = report.results  # this is a dictionary

# Extract metric values
missing_values = results["metrics"][0]["result"]["current"]["count"]
median_fare = results["metrics"][1]["result"]["current"]["value"]

print(f"Missing values in fare_amount: {missing_values}")
print(f"Median fare_amount (quantile=0.5): {median_fare}")



invalid value encountered in scalar divide


invalid value encountered in scalar divide



AttributeError: 'Report' object has no attribute 'results'

In [9]:
pip install evidently==0.4.35



SyntaxError: invalid syntax (2792120166.py, line 1)

In [10]:
!pip install evidently==0.4.35


Collecting evidently==0.4.35
  Downloading evidently-0.4.35-py3-none-any.whl.metadata (11 kB)
Collecting opentelemetry-api>=1.25.0 (from evidently==0.4.35)
  Using cached opentelemetry_api-1.39.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk>=1.25.0 (from evidently==0.4.35)
  Using cached opentelemetry_sdk-1.39.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.25.0 (from evidently==0.4.35)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-exporter-otlp-proto-http>=1.25.0 (from evidently==0.4.35)
  Downloading opentelemetry_exporter_otlp_proto_http-1.39.1-py3-none-any.whl.metadata (2.4 kB)


Collecting importlib-metadata<8.8.0,>=6.0 (from opentelemetry-api>=1.25.0->evidently==0.4.35)
  Using cached importlib_metadata-8.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting zipp>=3.20 (from importlib-metadata<8.8.0,>=6.0->opentelemetry-api>=1.25.0->evidently==0.4.35)
  Using cached zipp-3.23.0-py3-none-any.whl.metadata (3.6 kB)
Collecting googleapis-common-protos~=1.57 (from opentelemetry-exporter-otlp-proto-grpc>=1.25.0->evidently==0.4.35)
  Downloading googleapis_common_protos-1.72.0-py3-none-any.whl.metadata (9.4 kB)
Collecting grpcio<2.0.0,>=1.63.2 (from opentelemetry-exporter-otlp-proto-grpc>=1.25.0->evidently==0.4.35)
  Downloading grpcio-1.76.0-cp310-cp310-macosx_11_0_universal2.whl.metadata (3.7 kB)
Collecting opentelemetry-exporter-otlp-proto-common==1.39.1 (from opentelemetry-exporter-otlp-proto-grpc>=1.25.0->evidently==0.4.35)
  Downloading opentelemetry_exporter_otlp_proto_common-1.39.1-py3-none-any.whl.metadata (1.8 kB)
Collecting opentelemetry-semantic-conventions=

[2K    Uninstalling evidently-0.7.17:━━━━━━━━━━━━━━━━━━━━━━━[0m[38;5;237m╺[0m[38;5;237m━━━[0m [32m10/11[0m [evidently]api]
[2K      Successfully uninstalled evidently-0.7.17━━━━━━[0m[38;5;237m╺[0m[38;5;237m━━━[0m [32m10/11[0m [evidently]
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11/11[0m [evidently][0m [32m10/11[0m [evidently]
[1A[2KSuccessfully installed evidently-0.4.35 googleapis-common-protos-1.72.0 grpcio-1.76.0 importlib-metadata-8.7.0 opentelemetry-api-1.39.1 opentelemetry-exporter-otlp-proto-common-1.39.1 opentelemetry-exporter-otlp-proto-grpc-1.39.1 opentelemetry-exporter-otlp-proto-http-1.39.1 opentelemetry-sdk-1.39.1 opentelemetry-semantic-conventions-0.60b1 zipp-3.23.0


In [11]:
# Step 0: Make sure you have Evidently v0.4.35 installed
# pip install evidently==0.4.35

import pandas as pd
from evidently.report import Report
from evidently.metrics import ColumnQuantileMetric, DatasetMissingValuesMetric

# Step 1: Load March 2024 Green Taxi data
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
df = pd.read_parquet(url)

# Step 2: Compute duration and filter unrealistic trips
df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
df = df[(df.duration >= 1) & (df.duration <= 60)]

# Step 3: Split into reference and current datasets
reference_data = df.sample(frac=0.5, random_state=42)
current_data = df.drop(reference_data.index)

# Step 4: Define metrics
# - DatasetMissingValuesMetric: counts missing values in fare_amount
# - ColumnQuantileMetric: computes quantile (median) of fare_amount
report = Report(metrics=[
    DatasetMissingValuesMetric(column_name="fare_amount"),
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)
])

# Step 5: Run the report
report.run(reference_data=reference_data, current_data=current_data)

# Step 6: Extract metrics programmatically
results = report.as_dict()  # This works in Evidently v0.4.35

missing_values = results["metrics"][0]["result"]["current"]["count"]
median_fare = results["metrics"][1]["result"]["current"]["value"]

print(f"Missing values in fare_amount: {missing_values}")
print(f"Median fare_amount (quantile=0.5): {median_fare}")


ImportError: cannot import name 'BaseResult' from 'evidently.core' (/Users/nageswara_regula/Desktop/mlops-homework/04-deployment/homework/nbfix-env/lib/python3.10/site-packages/evidently/core/__init__.py)