In [4]:
# Step 0: Install packages if needed
# In terminal:
# pip install pandas numpy scikit-learn xgboost evidently==0.4.35

# Step 1: Imports
import pandas as pd
from evidently.report import Report
from evidently.metrics import ColumnQuantileMetric, DatasetMissingValuesMetric

# -----------------------
# Q1: Prepare the dataset
# -----------------------

# Load March 2024 Green Taxi data
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
df = pd.read_parquet(url)

# Compute trip duration and filter out unrealistic trips
df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
df = df[(df.duration >= 1) & (df.duration <= 60)]

# Show dataset shape
print(f"Dataset shape after filtering: {df.shape}")
print(f"Number of rows: {df.shape[0]}")

# -----------------------
# Q2: Metrics
# -----------------------

# Split data into reference and current datasets
reference_data = df.sample(frac=0.5, random_state=42)
current_data = df.drop(reference_data.index)

# Define Evidently metrics
report = Report(metrics=[
    DatasetMissingValuesMetric(column_name="fare_amount"),  # missing values metric
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)  # median fare
])

# Run the report
report.run(reference_data=reference_data, current_data=current_data)

# Extract results programmatically
results = report.as_dict()

missing_values = results["metrics"][0]["result"]["current"]["count"]
median_fare = results["metrics"][1]["result"]["current"]["value"]

print(f"Missing values in fare_amount: {missing_values}")
print(f"Median fare_amount (quantile=0.5): {median_fare}")


Dataset shape after filtering: (55139, 21)
Number of rows: 55139


TypeError: DatasetMissingValuesMetric.__init__() got an unexpected keyword argument 'column_name'

In [2]:
pip install evidently==0.4.35


Collecting evidently==0.4.35
  Using cached evidently-0.4.35-py3-none-any.whl.metadata (11 kB)
Collecting plotly>=5.10.0 (from evidently==0.4.35)
  Using cached plotly-6.5.0-py3-none-any.whl.metadata (8.5 kB)
Collecting statsmodels>=0.12.2 (from evidently==0.4.35)
  Using cached statsmodels-0.14.6-cp310-cp310-macosx_11_0_arm64.whl.metadata (9.5 kB)
Collecting nltk>=3.6.7 (from evidently==0.4.35)
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pydantic>=1.10.13 (from evidently==0.4.35)
  Using cached pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting litestar>=2.8.3 (from evidently==0.4.35)
  Using cached litestar-2.19.0-py3-none-any.whl.metadata (26 kB)
Collecting typing-inspect>=0.9.0 (from evidently==0.4.35)
  Using cached typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting uvicorn>=0.22.0 (from uvicorn[standard]>=0.22.0->evidently==0.4.35)
  Using cached uvicorn-0.38.0-py3-none-any.whl.metadata (6.8 kB)
Collecting watchdog>=3.0.0 (fro

In [3]:
python -c "import evidently; print(evidently.__version__)"


SyntaxError: invalid syntax (3739093832.py, line 1)

In [8]:
# =====================================================
# Q1 + Q2: NYC Taxi Data Preparation and Metrics
# =====================================================

# Step 0: Imports
import pandas as pd
from evidently.report import Report
from evidently.metrics import ColumnQuantileMetric, DatasetMissingValuesMetric

# -----------------------
# Q1: Prepare the dataset
# -----------------------

# Load March 2024 Green Taxi data
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
df = pd.read_parquet(url)

# Compute trip duration and filter unrealistic trips
df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
df = df[(df.duration >= 1) & (df.duration <= 60)]

print(f"Dataset shape after filtering: {df.shape}")
print(f"Number of rows: {df.shape[0]}")

# -----------------------
# Q2: Metrics
# -----------------------

# Split dataset into reference and current
reference_data = df.sample(frac=0.5, random_state=42)
current_data = df.drop(reference_data.index)

# Define Evidently metrics
metrics = [
    DatasetMissingValuesMetric(),  # track missing values for all columns
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)  # median fare
]

report = Report(metrics=metrics)

# Run the report
report.run(reference_data=reference_data, current_data=current_data)

# -----------------------
# Extract metrics programmatically
# -----------------------

results = report.as_dict()

# Step 1: DatasetMissingValuesMetric
missing_values_result = results["metrics"][0]["result"]["current"]
# Extract missing values for 'fare_amount'
missing_values_fare_amount = missing_values_result.get("fare_amount", None)

# Step 2: ColumnQuantileMetric
median_fare = results["metrics"][1]["result"]["current"]["value"]

print(f"Missing values in fare_amount: {missing_values_fare_amount}")
print(f"Median fare_amount (0.5 quantile): {median_fare}")

# =====================================================
# ✅ Done! This notebook covers:
# - Q1: dataset shape and filtering
# - Q2: metrics monitoring (missing values + median fare)
# =====================================================


Dataset shape after filtering: (55139, 21)
Number of rows: 55139
Missing values in fare_amount: None
Median fare_amount (0.5 quantile): 13.5


In [9]:
# Step 1: Imports for Evidently metrics
from evidently.report import Report
from evidently.metrics import ColumnQuantileMetric, DatasetMissingValuesMetric

# Split dataset into reference and current
reference_data = df.sample(frac=0.5, random_state=42)
current_data = df.drop(reference_data.index)

# Define Evidently metrics
metrics = [
    DatasetMissingValuesMetric(),  # track missing values for all columns
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)  # median fare
]

# Create and run the report
report = Report(metrics=metrics)
report.run(reference_data=reference_data, current_data=current_data)

# Extract metrics programmatically
results = report.as_dict()

# Missing values for 'fare_amount'
missing_values_result = results["metrics"][0]["result"]["current"]
missing_values_fare_amount = missing_values_result.get("fare_amount", None)

# Median fare (0.5 quantile)
median_fare = results["metrics"][1]["result"]["current"]["value"]

print(f"Missing values in fare_amount: {missing_values_fare_amount}")
print(f"Median fare_amount (0.5 quantile): {median_fare}")


Missing values in fare_amount: None
Median fare_amount (0.5 quantile): 13.5


In [10]:
import pandas as pd

url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet"
df = pd.read_parquet(url)

# Compute trip duration and filter unrealistic trips
df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
df = df[(df.duration >= 1) & (df.duration <= 60)]

# Convert pickup datetime to date for daily aggregation
df['pickup_date'] = df.lpep_pickup_datetime.dt.date


In [11]:
daily_median_fare = df.groupby('pickup_date')['fare_amount'].quantile(0.5)
print(daily_median_fare)


pickup_date
2024-02-23    20.000
2024-02-25    12.100
2024-02-29    34.000
2024-03-01    13.500
2024-03-02    13.500
2024-03-03    14.200
2024-03-04    12.800
2024-03-05    13.500
2024-03-06    12.800
2024-03-07    13.500
2024-03-08    13.185
2024-03-09    13.500
2024-03-10    14.200
2024-03-11    12.800
2024-03-12    13.500
2024-03-13    13.500
2024-03-14    14.200
2024-03-15    13.500
2024-03-16    13.500
2024-03-17    13.500
2024-03-18    13.500
2024-03-19    13.500
2024-03-20    13.430
2024-03-21    13.500
2024-03-22    13.500
2024-03-23    12.800
2024-03-24    14.200
2024-03-25    13.500
2024-03-26    13.500
2024-03-27    13.500
2024-03-28    13.500
2024-03-29    13.460
2024-03-30    14.200
2024-03-31    13.500
2024-04-01     5.800
Name: fare_amount, dtype: float64


In [12]:
max_daily_median = daily_median_fare.max()
print(f"Maximum daily median fare (quantile=0.5): {max_daily_median}")


Maximum daily median fare (quantile=0.5): 34.0
