In [0]:
%run ../utils/validation

In [0]:
import pytest
from pyspark.sql.functions import *

In [0]:
# Table under test. The check functions in the cells below read this
# module-level `df` directly instead of taking it as a parameter.
# NOTE(review): `spark` is not defined in the visible cells -- presumably the
# session object provided by the notebook runtime; confirm.
df = spark.table("nyc_taxi.silver.data")

In [0]:
# Print the table's column names and types so the columns referenced by the
# checks below can be verified by eye in the notebook output.
df.printSchema()

In [0]:
def test_no_nulls_in_required_columns():
    null_counts_df = null_check(df)
    null_counts = null_counts_df.collect()[0].asDict()
    required_cols = [
        "vendor_id", "pickup_datetime", "dropoff_datetime",
        "passenger_count", "trip_distance", "fare_amount", "pickup_month", "trip_id"
    ]
    for col_name in required_cols:
        assert null_counts[f"{col_name}_null_count"] == 0, f"{col_name} has nulls!"
    print("test_no_nulls_in_required_columns passed successfully!")
test_no_nulls_in_required_columns()

In [0]:
def test_no_duplicate_trip_ids():
    dup_df = check_duplicates(df, ["trip_id"])
    assert dup_df.count() == 0, "Duplicate trip_id values found!"
    print("test_no_duplicate_trip_ids passed successfully!")
test_no_duplicate_trip_ids()

In [0]:
def test_valid_trip_distance():
    invalids = check_value_range(df, "trip_distance", 0.01, 500)
    assert invalids.count() == 0, "Found invalid trip_distance values"
    print("test_valid_trip_distance passed successfully!")
test_valid_trip_distance()

In [0]:
def test_fare_amount_non_negative():
    invalids = check_value_range(df, "fare_amount", 0, float('inf'))
    assert invalids.count() == 0, "Found negative fare_amount"
    print("test_fare_amount_non_negative passed successfully!")
test_fare_amount_non_negative()

In [0]:
def test_minimum_row_count():
    assert count_rows(df) > 0, "Silver table is empty!"
    print("test_minimum_row_count passed successfully!")
test_minimum_row_count()