# Tool for unit testing of the data
- TL;DR: Great Expectations (gx) is a library for unit testing data.
- You define expectations about the data and then validate the data against them to detect anomalies, unexpected changes, etc.

A simple example:
- a categorical variable such as food should only contain values from the list ['apple', 'banana', 'orange'], but suddenly 'kiwi' appears in the data.

A more complex example:
- you have a column whose mean and standard deviation should stay within some range
- the feature importance of your model should be ordered X, Y, Z, but suddenly it is Z, X, Y. Why would the model's feature importance change?

In [1]:
import great_expectations as gx
import pandas as pd

In [2]:
# Obtain the Great Expectations Data Context; later cells register data sources
# and suites through this `context` object.
context = gx.get_context()

In [None]:
# Template columns shared by both demo frames; load_data() swaps in the "price" column.
base_data = {
    "raw_data_id": [1, 2, 3, 4, 5],
    # Lowercase "h" is the hourly alias; uppercase "H" is deprecated since pandas 2.2
    # and emits a FutureWarning there.
    "timestamp": pd.date_range(start="2023-10-01", periods=5, freq="h"),
    "price": [100, 101, 102, 103, 104],
    "volume": [1000, 950, 900, 850, 800]
}

def load_data(data_type: str) -> pd.DataFrame:
    """Build the five-row demo frame in its "good" or "bad" variant.

    Both variants share every column of ``base_data`` except ``price``: the
    "bad" variant ends with 110 instead of 104.

    Args:
        data_type: Either ``"good"`` or ``"bad"``.

    Returns:
        A new DataFrame with columns raw_data_id, timestamp, price, volume.

    Raises:
        AssertionError: If ``data_type`` is neither ``"good"`` nor ``"bad"``.
    """
    assert data_type in ("good", "bad"), f"Invalid data type: {data_type}"
    prices = {
        "good": [100, 101, 102, 103, 104],
        "bad": [100, 101, 102, 103, 110],
    }
    # Build a fresh dict so the module-level template is never modified;
    # overriding an existing key keeps the original column order.
    return pd.DataFrame({**base_data, "price": prices[data_type]})

In [3]:
# Materialise both demo variants in one step: the clean frame first, then the
# frame whose last price is off.
good_data, bad_data = (load_data(variant) for variant in ("good", "bad"))

In [4]:
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

# Create expectations
# `dataset` is never defined in this notebook and the `dataset.expect_*` helpers
# belong to the legacy API, so the same three rules are declared as GX 1.x
# Expectation objects collected in a suite registered with the Data Context.
expectations = context.suites.add(gx.ExpectationSuite(name="my_expectations"))
expectations.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="name")
)
expectations.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="age", min_value=18, max_value=100
    )
)
expectations.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="email", regex=r"^[^@]+@[^@]+\.[^@]+$"
    )
)

# Save the suite through the Data Context's store.
# NOTE(review): where the suite ends up (memory vs. files on disk) depends on the
# kind of context returned by gx.get_context() — confirm if persistence matters.
expectations.save()


Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code_id,store_and_fwd_flag,pickup_location_id,dropoff_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-15 03:36:12,2019-01-15 03:42:19,1,1.00,1,N,230,48,1,6.5,0.5,0.5,1.95,0.0,0.3,9.75,
1,1,2019-01-25 18:20:32,2019-01-25 18:26:55,1,0.80,1,N,112,112,1,6.0,1.0,0.5,1.55,0.0,0.3,9.35,0.0
2,1,2019-01-05 06:47:31,2019-01-05 06:52:19,1,1.10,1,N,107,4,2,6.0,0.0,0.5,0.00,0.0,0.3,6.80,
3,1,2019-01-09 15:08:02,2019-01-09 15:20:17,1,2.50,1,N,143,158,1,11.0,0.0,0.5,3.00,0.0,0.3,14.80,
4,1,2019-01-25 18:49:51,2019-01-25 18:56:44,1,0.80,1,N,246,90,1,6.5,1.0,0.5,1.65,0.0,0.3,9.95,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,2019-01-02 07:48:44,2019-01-02 08:00:13,6,1.07,1,N,50,161,2,8.5,0.0,0.5,0.00,0.0,0.3,9.30,
9996,2,2019-01-16 19:06:45,2019-01-16 19:10:05,6,0.35,1,N,234,234,1,4.0,1.0,0.5,1.16,0.0,0.3,6.96,
9997,2,2019-01-02 09:10:44,2019-01-02 09:36:46,6,4.12,1,N,50,236,1,20.0,0.0,0.5,6.24,0.0,0.3,27.04,
9998,2,2019-01-03 13:28:36,2019-01-03 13:36:42,6,1.17,1,N,137,234,1,7.0,0.0,0.5,0.90,0.0,0.3,8.70,


In [5]:
# Show the distinct payment_type codes present in the frame.
# NOTE(review): `df` is not assigned in any cell visible before this one —
# confirm the cell that loads the taxi data still exists and runs first.
df["payment_type"].unique()

array([1, 2, 4, 3])

In [6]:
# Register a pandas data source and a dataframe asset on the Data Context.
# NOTE(review): an earlier cell already calls add_pandas("pandas") with the same
# name; a second registration on the same context presumably fails on a clean
# Restart & Run All — confirm and drop one of the two.
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")

# A whole-dataframe batch definition; the concrete frame is supplied at
# get_batch() time through the "dataframe" batch parameter.
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

In [7]:
# Range rule for passenger_count: values are expected between 1 and 6.
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="passenger_count",
    min_value=1,
    max_value=6,
)

In [8]:
# Validate the batch against the single expectation and keep the result object.
validation_result = batch.validate(expectation)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
# This cell reuses `gx` and `context` from the setup cells. Re-importing is
# redundant, and calling gx.get_context() again would rebind `context` away from
# the object the data source and batch were registered with.

# Create an Expectation Suite
suite_name = "my_expectation_suite"
suite = gx.ExpectationSuite(name=suite_name)

# Add the Expectation Suite to the Data Context
suite = context.suites.add(suite)

# Create an Expectation to put into an Expectation Suite.
# It gets its own name so that `expectation` (the passenger_count range check
# used by the batch.validate(expectation) cells) is not overwritten when the
# notebook is run top to bottom.
not_null_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(
    column="passenger_count"
)

# Add the previously created Expectation to the Expectation Suite
suite.add_expectation(not_null_expectation)

# Add another Expectation to the Expectation Suite.
suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="pickup_datetime")
)

# Update the configuration of an Expectation, then push the changes to the Expectation Suite
not_null_expectation.column = "pickup_location_id"
not_null_expectation.save()

# Retrieve an Expectation Suite from the Data Context
existing_suite_name = (
    "my_expectation_suite"  # replace this with the name of your Expectation Suite
)
suite = context.suites.get(name=existing_suite_name)
suite.save()

# Assume the company onboarded vans that carry 8 to 10 passengers

In [10]:
# updating random 500 random rows to have passenger_count between 8 and 10
# Overwrite passenger_count on 500 randomly chosen rows with values from 8 to 10.
# A fixed random_state makes the same rows get picked on every Restart & Run All.
inidices_to_update = df.sample(500, random_state=42).index

# Split the 500 rows: first 250 -> 8, next 125 -> 9, remaining 125 -> 10.
# NOTE(review): `df` is modified in place here; the later validation cell relies
# on the previously created batch seeing these new values — confirm.
df.loc[inidices_to_update[:250], "passenger_count"] = 8
df.loc[inidices_to_update[250:375], "passenger_count"] = 9
df.loc[inidices_to_update[375:], "passenger_count"] = 10

In [11]:
# Validate the batch against `expectation` again, now that passenger_count has
# been modified, and keep the new result object.
validation_result = batch.validate(expectation)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

# Failing validation result
- `success` is `False`
- `unexpected_percent` gives the percentage of offending rows
- `partial_unexpected_list` shows a sample of the offending values

In [12]:
# Render the validation result as a plain dict so the cell output shows it in full.
validation_result.to_json_dict()

{'success': False,
 'expectation_config': {'type': 'expect_column_values_to_be_between',
  'kwargs': {'batch_id': 'pandas-pd dataframe asset',
   'column': 'passenger_count',
   'min_value': 1.0,
   'max_value': 6.0},
  'meta': {}},
 'result': {'element_count': 10000,
  'unexpected_count': 500,
  'unexpected_percent': 5.0,
  'partial_unexpected_list': [10,
   8,
   10,
   8,
   8,
   8,
   8,
   9,
   8,
   10,
   8,
   10,
   8,
   8,
   8,
   8,
   8,
   9,
   8,
   8],
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_percent_total': 5.0,
  'unexpected_percent_nonmissing': 5.0,
  'partial_unexpected_counts': [{'value': 8, 'count': 14},
   {'value': 10, 'count': 4},
   {'value': 9, 'count': 2}],
  'partial_unexpected_index_list': [12,
   14,
   103,
   105,
   118,
   122,
   152,
   163,
   168,
   185,
   192,
   213,
   236,
   247,
   249,
   294,
   304,
   315,
   328,
   333]},
 'meta': {},
 'exception_info': {'raised_exception': False,
  'exception_traceback': Non