In [1]:
import datetime as dt
from pathlib import Path
from urllib.request import urlretrieve

import geopandas as gpd
import pandas as pd

Great Expectations is an open-source, Python-based library that brings the idea of "testing" to your data. It enables you to define expectations for properties of your datasets (like records per batch, the distribution of values in a column, the columns in a table, etc.) and to check that the data still meets those expectations whenever it is updated.

## Sample Data Collection and Preparation

In [2]:
# Anchor all paths at the (resolved) current working directory and keep downloaded/derived
# files in a `data/` subdirectory, creating it on first run.
POST_DIR = Path(".").resolve()
POST_DATA_DIR = POST_DIR / "data"
POST_DATA_DIR.mkdir(exist_ok=True)

In [3]:
# Chicago food inspections dataset, exported from the city's data portal as GeoJSON.
url = "https://data.cityofchicago.org/api/geospatial/4ijn-s7e5?method=export&format=GeoJSON"
full_file_path = POST_DATA_DIR.joinpath("full_food_inspections.geojson")
if not full_file_path.is_file():
    # Download to a temporary ".part" file and rename it only after the transfer finishes, so
    # an interrupted download is never mistaken for a complete cached file on the next run.
    partial_file_path = full_file_path.with_name(f"{full_file_path.name}.part")
    urlretrieve(url=url, filename=partial_file_path)
    partial_file_path.replace(full_file_path)
# Load the cached file into a GeoDataFrame.
full_food_inspection_gdf = gpd.read_file(full_file_path)

In [4]:
# Report the (rows, columns) dimensions, then preview the first two records.
print(full_food_inspection_gdf.shape)
full_food_inspection_gdf.head(n=2)

(255573, 21)


Unnamed: 0,location_state,facility_type,city,location_zip,inspection_id,license_,latitude,zip,state,location_address,...,aka_name,risk,longitude,dba_name,inspection_date,results,inspection_type,address,violations,geometry
0,,Restaurant,CHICAGO,,2577546,2808408,41.79300598548857,60638,IL,,...,CIAO RAGAZZI,Risk 1 (High),-87.78197456162096,CIAO RAGAZZI,2023-06-20,Pass w/ Conditions,Canvass,5440 S NARRAGANSETT AVE,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,POINT (-87.78197 41.79301)
1,,Restaurant,CHICAGO,,2577553,2078887,41.83100808816173,60616,IL,,...,CHI SOX BAR & GRILL,Risk 1 (High),-87.63493248572952,CHISOX BAR & GRILL,2023-06-20,Pass,Canvass,320 W 35TH ST,51. PLUMBING INSTALLED; PROPER BACKFLOW DEVICE...,POINT (-87.63493 41.83101)


For some reason, Socrata adds these four always-null location columns to geospatial exports. I'm going to remove them.

In [5]:
location_cols = ["location_state", "location_zip", "location_address", "location_city"]
# Only inspect/drop the columns that are still present, so this cell can be re-run without a
# KeyError after the columns have already been removed.
present_location_cols = [
    col for col in location_cols if col in full_food_inspection_gdf.columns
]
print("Rows with a non-null value in these location_xxx columns:")
# Confirm the columns really are empty before discarding them.
display(full_food_inspection_gdf[present_location_cols].notnull().sum())
full_food_inspection_gdf = full_food_inspection_gdf.drop(columns=present_location_cols)

Rows with a non-null value in these location_xxx columns:


location_state      0
location_zip        0
location_address    0
location_city       0
dtype: int64

That column ordering is a bit chaotic, so I'll reorder them (for readability).

In [6]:
# Desired left-to-right column layout (geometry last).
col_order = [
    # which establishment was inspected, and when
    "inspection_id",
    "inspection_date",
    "dba_name",
    "aka_name",
    "license_",
    "facility_type",
    # inspection details and outcome
    "risk",
    "inspection_type",
    "results",
    # street address
    "address",
    "city",
    "state",
    "zip",
    # findings
    "violations",
    # coordinates
    "longitude",
    "latitude",
    "geometry",
]
# Select the columns in that order; `.copy()` gives an independent frame.
full_food_inspection_gdf = full_food_inspection_gdf[col_order].copy()

I also want to break this into batches based on the dates, so I need to cast the `inspection_date` column to a datetime type.

In [7]:
# Parse the inspection dates into pandas datetimes so they can be grouped by month below.
full_food_inspection_gdf["inspection_date"] = pd.to_datetime(full_food_inspection_gdf["inspection_date"])

In [8]:
# Preview the first two records after dropping, reordering, and re-typing columns.
full_food_inspection_gdf.head(n=2)

Unnamed: 0,inspection_id,inspection_date,dba_name,aka_name,license_,facility_type,risk,inspection_type,results,address,city,state,zip,violations,longitude,latitude,geometry
0,2577546,2023-06-20,CIAO RAGAZZI,CIAO RAGAZZI,2808408,Restaurant,Risk 1 (High),Canvass,Pass w/ Conditions,5440 S NARRAGANSETT AVE,CHICAGO,IL,60638,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,-87.78197456162096,41.79300598548857,POINT (-87.78197 41.79301)
1,2577553,2023-06-20,CHISOX BAR & GRILL,CHI SOX BAR & GRILL,2078887,Restaurant,Risk 1 (High),Canvass,Pass,320 W 35TH ST,CHICAGO,IL,60616,51. PLUMBING INSTALLED; PROPER BACKFLOW DEVICE...,-87.63493248572952,41.83100808816173,POINT (-87.63493 41.83101)


In [9]:
# First day of every calendar month that contains at least one inspection, in chronological
# order. The values are wrapped in a DatetimeIndex and sorted with `sort_values()` instead of
# calling `.sort()` in place on the result of `.unique()`: that result is a NumPy array on
# older pandas but a pandas DatetimeArray on pandas 2.x, which does not offer an in-place sort.
month_start_dates = pd.DatetimeIndex(
    full_food_inspection_gdf["inspection_date"].dt.to_period("M").dt.to_timestamp().unique()
).sort_values()

In [10]:
# Alphabetically sorted names of everything currently in the data directory; show the first five.
data_file_names = sorted(entry.name for entry in POST_DATA_DIR.iterdir())
data_file_names[:5]

['full_food_inspections.geojson']

In [11]:
# Write one parquet file per calendar month of inspections, skipping months already on disk.
for month_start_date in month_start_dates:
    batch_period = pd.to_datetime(month_start_date).strftime("%Y_%m")
    batch_file_path = POST_DATA_DIR.joinpath(f"food_inspection_batch_{batch_period}.parquet")
    # Check for an existing file *before* filtering, so re-running this cell doesn't re-scan
    # and copy the full dataset once per month just to discard the result.
    if batch_file_path.is_file():
        continue
    # Half-open interval [month start, next month start): inclusive="left" keeps the first
    # day of the month and excludes the first day of the following month.
    is_in_month = full_food_inspection_gdf["inspection_date"].between(
        left=month_start_date,
        right=month_start_date + pd.DateOffset(months=1),
        inclusive="left",
    )
    batch_data = full_food_inspection_gdf.loc[is_in_month].copy()
    batch_data.to_parquet(batch_file_path)

In [12]:
# Re-list the data directory (sorted by name) now that the batch files have been written.
data_file_names = sorted(entry.name for entry in POST_DATA_DIR.iterdir())
data_file_names[:5]

['food_inspection_batch_2010_01.parquet',
 'food_inspection_batch_2010_02.parquet',
 'food_inspection_batch_2010_03.parquet',
 'food_inspection_batch_2010_04.parquet',
 'food_inspection_batch_2010_05.parquet']

## Great Expectations Setup

First, you'll need to install the `great_expectations` package. If you already have `conda` installed on your machine, you can easily set up a conda env just like the one used to run this notebook by:
1. copying the `gx_env_environment.yml` file (in the same dir as this notebook file) to your machine,
2. opening a terminal and navigating to the dir with that new file, and
3. running the command `conda env create -f gx_env_environment.yml`

## Create or Load Great Expectations Data Context

In [13]:
import great_expectations as gx
from great_expectations.data_context import FileDataContext

# Build a file-backed Great Expectations data context rooted at the notebook directory.
# NOTE(review): presumably `create` reuses an existing project in POST_DIR rather than
# overwriting it (the section title says "Create or Load") — confirm against the GX docs.
context = FileDataContext.create(project_root_dir=POST_DIR)

## Create a Datasource

In [17]:
datasource_name = "food_inspection_datasource"

# Get-or-create: reuse the datasource if the context already knows one by this name,
# otherwise register a pandas filesystem datasource pointed at the data directory.
datasource_exists = any(
    ds_config["name"] == datasource_name for ds_config in context.list_datasources()
)
if not datasource_exists:
    print(f"No Datasource with name '{datasource_name}' found; creating now")
    datasource = context.sources.add_pandas_filesystem(
        name=datasource_name,
        base_directory=POST_DATA_DIR,
    )
else:
    print(f"Datasource with name '{datasource_name}' found; loading now")
    datasource = context.get_datasource(datasource_name)

Datasource with name 'food_inspection_datasource' found; loading now


True

In [20]:
# List the datasource's `add_*` attributes to see which asset types it can register.
[attr_name for attr_name in dir(datasource) if attr_name.startswith("add_")]

['add_csv_asset',
 'add_excel_asset',
 'add_feather_asset',
 'add_fwf_asset',
 'add_hdf_asset',
 'add_html_asset',
 'add_json_asset',
 'add_orc_asset',
 'add_parquet_asset',
 'add_pickle_asset',
 'add_sas_asset',
 'add_spss_asset',
 'add_stata_asset',
 'add_xml_asset']

In [29]:
data_asset_name = "food_inspections_asset"

# Get-or-create: reuse the asset when the datasource already has one by this name.
if data_asset_name in datasource.get_asset_names():
    data_asset = datasource.get_asset(data_asset_name)
else:
    print(f"Creating data asset {data_asset_name}")
    # The regex matches the monthly batch file names and captures the `year` and `month`
    # parts of each file name as named groups.
    data_asset = datasource.add_parquet_asset(
        name=data_asset_name,
        batching_regex=r"food_inspection_batch_(?P<year>\d{4})_(?P<month>\d{2})\.parquet",
    )

Creating data asset food_inspections_asset


I'll also sort these batches.

In [30]:
# Sort batches by the `year` and then `month` groups captured from the file names.
# NOTE(review): the "+" prefix presumably requests ascending order — confirm against the GX docs.
data_asset = data_asset.add_sorters(["+year", "+month"])

In [31]:
# Build a batch request with no arguments (presumably selecting every batch of the asset —
# TODO confirm), then resolve it into the corresponding list of batches.
batch_request = data_asset.build_batch_request()
batches = data_asset.get_batch_list_from_batch_request(batch_request)

## Using the profiler to create basic expectations

In [35]:
expectation_suite_name = "food_inspections_suite"

# Register the suite under this name with the context (add-or-update, so re-running is safe).
expectation_suite = context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)

In [36]:
# Run the onboarding data assistant over the batch request, without excluding any columns.
data_assistant_result = context.assistants.onboarding.run(batch_request=batch_request, exclude_column_names=[])




Generating Expectations:   0%|          | 0/8 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/324 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/324 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5670 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5670 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/5670 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/5 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/162 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/810 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/0 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/15 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/972 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3726 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/3078 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/648 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1134 [00:00<?, ?it/s]

In [47]:
# Plot the assistant's expectations and metrics, leaving out the `inspection_date` column.
data_assistant_result.plot_expectations_and_metrics(exclude_column_names=["inspection_date"])

62 Expectations produced, 9 Expectation and Metric plots implemented
Use DataAssistantResult.show_expectations_by_domain_type() or
DataAssistantResult.show_expectations_by_expectation_type() to show all produced Expectations


interactive(children=(Dropdown(description='Select Plot Type: ', layout=Layout(margin='0px', width='max-conten…



In [40]:
# Collect the expectations produced by the data assistant into a suite with our suite name.
expectation_suite = data_assistant_result.get_expectation_suite(expectation_suite_name=expectation_suite_name)

In [48]:
# expectation_suite.show_expectations_by_expectation_type()

In [50]:
# Hand the assistant-generated suite to the context via add-or-update.
# NOTE(review): presumably this persists the suite and replaces any existing suite with the
# same name — confirm against the GX docs.
saved_suite = context.add_or_update_expectation_suite(expectation_suite=expectation_suite)