In [1]:
import sys
import os

# Make the parent of the notebook's working directory importable
# (presumably so that the project-local `src` package used further down resolves).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Re-running this cell must not stack duplicate entries onto sys.path,
# so only append when the directory is not registered yet.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import great_expectations as gx

In [3]:
# Open the file-backed Great Expectations project rooted at ../services
# (the path is resolved relative to the notebook's working directory).
context = gx.get_context(
    mode="file",
    project_root_dir="../services",
)

# Create a datasource

In [4]:
# Register (or overwrite) the pandas datasource that will hold the
# transformed sample; the bare `ds` below renders it as the cell output.
ds = context.sources.add_or_update_pandas(
    name="transformed_sample",
)
ds

PandasDatasource(type='pandas', name='transformed_sample', id=None, assets=[])

## Load data

In [None]:
# Project-local helpers from the `src` package.
from src.data import extract_data, preprocess_data
from src.utils import init_hydra

# NOTE(review): presumably loads the Hydra configuration named 'main' — confirm in src.utils.
cfg = init_hydra('main')
# "v1.0" looks like a data-version tag (TODO confirm); the second returned value is discarded.
df, _ = extract_data("v1.0", cfg)
# X and y are the objects registered as dataframe assets and validated further down.
X, y = preprocess_data(cfg, df)


In [6]:
# Bare expression: renders the column labels of X as the cell output.
X.columns

Index(['FlightDate', 'Airline', 'Origin', 'Dest', 'Diverted', 'CRSDepTime',
       'CRSElapsedTime', 'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth',
       'DayOfWeek', 'Marketing_Airline_Network',
       'Operated_or_Branded_Code_Share_Partners', 'DOT_ID_Marketing_Airline',
       'IATA_Code_Marketing_Airline', 'Flight_Number_Marketing_Airline',
       'Operating_Airline', 'DOT_ID_Operating_Airline',
       'IATA_Code_Operating_Airline', 'Flight_Number_Operating_Airline',
       'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID',
       'OriginCityName', 'OriginState', 'OriginStateFips', 'OriginStateName',
       'OriginWac', 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID',
       'DestCityName', 'DestState', 'DestStateFips', 'DestStateName',
       'DestWac', 'DepTimeBlk', 'CRSArrTime', 'ArrTimeBlk', 'DistanceGroup',
       'DivAirportLandings', 'Month_sin', 'Month_cos', 'DayOfWeek_sin',
       'DayOfWeek_cos', 'DayofMonth_sin', 'DayofMonth_cos', 'Quarter_sin

## Add dataframe assets

In [7]:
# Attach X and y to the datasource as two separate in-memory dataframe assets,
# then display the datasource so the registered assets are visible.
ds_X = ds.add_dataframe_asset(
    dataframe=X,
    name="X",
)
ds_y = ds.add_dataframe_asset(
    dataframe=y,
    name="y",
)
ds

[1;35mSaving 2 Fluent Datasources to /home/sv-cheats-1/Documents/PROJECTS/MLops-project/notebooks/../services/gx/great_expectations.yml[0m
[1;35mCSVAsset.dict() - missing [0m[1;36mconfig_provider[1;35m, skipping config substitution[0m
[1;35mPandasDatasource.dict() - missing [0m[1;36mconfig_provider[1;35m, skipping config substitution[0m
[1;35mDataFrameAsset.dict() - missing [0m[1;36mconfig_provider[1;35m, skipping config substitution[0m
[1;35mPandasDatasource.dict() - missing [0m[1;36mconfig_provider[1;35m, skipping config substitution[0m
[1;35mSaving 2 Fluent Datasources to /home/sv-cheats-1/Documents/PROJECTS/MLops-project/notebooks/../services/gx/great_expectations.yml[0m
[1;35mCSVAsset.dict() - missing [0m[1;36mconfig_provider[1;35m, skipping config substitution[0m
[1;35mPandasDatasource.dict() - missing [0m[1;36mconfig_provider[1;35m, skipping config substitution[0m
[1;35mDataFrameAsset.dict() - missing [0m[1;36mconfig_provider[1;35m, skipping

PandasDatasource(type='pandas', name='transformed_sample', id=None, assets=[DataFrameAsset(name='X', type='dataframe', id=None, order_by=[], batch_metadata={}), DataFrameAsset(name='y', type='dataframe', id=None, order_by=[], batch_metadata={})])

# Create a suite


In [8]:
# Create the "feature_transform" suite, or replace it if it already exists;
# the returned suite is shown as the cell output.
context.add_or_update_expectation_suite(
    expectation_suite_name="feature_transform",
)

{
  "expectation_suite_name": "feature_transform",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.18.18"
  }
}

# Define the validator

In [9]:
# Build the batch request for the X asset first, handing over the dataframe
# explicitly, then bind it to the "feature_transform" suite.
x_batch_request = ds_X.build_batch_request(dataframe=X)
validator = context.get_validator(
    expectation_suite_name="feature_transform",
    batch_request=x_batch_request,
)

# List the columns the validator sees in the batch.
validator.columns()

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

['FlightDate',
 'Airline',
 'Origin',
 'Dest',
 'Diverted',
 'CRSDepTime',
 'CRSElapsedTime',
 'Distance',
 'Year',
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'Marketing_Airline_Network',
 'Operated_or_Branded_Code_Share_Partners',
 'DOT_ID_Marketing_Airline',
 'IATA_Code_Marketing_Airline',
 'Flight_Number_Marketing_Airline',
 'Operating_Airline',
 'DOT_ID_Operating_Airline',
 'IATA_Code_Operating_Airline',
 'Flight_Number_Operating_Airline',
 'OriginAirportID',
 'OriginAirportSeqID',
 'OriginCityMarketID',
 'OriginCityName',
 'OriginState',
 'OriginStateFips',
 'OriginStateName',
 'OriginWac',
 'DestAirportID',
 'DestAirportSeqID',
 'DestCityMarketID',
 'DestCityName',
 'DestState',
 'DestStateFips',
 'DestStateName',
 'DestWac',
 'DepTimeBlk',
 'CRSArrTime',
 'ArrTimeBlk',
 'DistanceGroup',
 'DivAirportLandings',
 'Month_sin',
 'Month_cos',
 'DayOfWeek_sin',
 'DayOfWeek_cos',
 'DayofMonth_sin',
 'DayofMonth_cos',
 'Quarter_sin',
 'Quarter_cos',
 'CRSArrTime_HH',
 'CRSArrTim

### 1. Check that no null values exist

In [None]:
# One not-null expectation per column of the validated batch.
all_columns = validator.columns()
for col_name in all_columns:
    validator.expect_column_values_to_not_be_null(column=col_name)

### 2. Check that columns that were hashed are hashed properly

In [11]:
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose


# Columns still carrying the 'object' dtype are the only ones targeted here;
# each is expected to hold values within the inclusive range [0, 999].
object_columns = X.columns[X.dtypes == 'object']
for hashed_column in object_columns:
    validator.expect_column_values_to_be_between(
        column=hashed_column,
        max_value=999,
        min_value=0,
    )

# Adds no expectations, since it is not known what columns were hashed

### 3. Check that datatypes are correct (everything is a number) 

In [None]:
# Pin every column to the dtype it currently has in X, so that a later
# validation run flags any column whose type has changed.
for col_name in validator.columns():
    expected_dtype = str(X[col_name].dtype)
    validator.expect_column_values_to_be_of_type(
        column=col_name,
        type_=expected_dtype,
    )

### 4. Check that cyclic data is encoded correctly

In [None]:
# Sine/cosine encoded features are recognised by their name suffix and
# must stay inside the closed interval [-1, 1].
cyclic_columns = [
    name for name in validator.columns() if name.endswith(('cos', 'sin'))
]
for cyclic_column in cyclic_columns:
    validator.expect_column_values_to_be_between(
        column=cyclic_column,
        max_value=1,
        min_value=-1,
    )

In [14]:
# Report how many expectations the suite has accumulated so far.
expectation_count = len(validator.expectation_suite.expectations)
print("Total number of expectations:", expectation_count)

Total number of expectations: 158


In [15]:
# Persist the suite, keeping expectations even if they failed on this batch.
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

# Create (or replace) the checkpoint that re-runs this validator.
checkpoint = context.add_or_update_checkpoint(
    validator=validator,
    name="transformations_checkpoint",
)

[1;35m	158 expectation(s) included in expectation_suite.[0m


In [16]:
# Execute the checkpoint against the batch bound to the validator.
checkpoint_result = checkpoint.run()

# Open report in browser
context.view_validation_result(checkpoint_result)

# Summarise the overall outcome in a single line.
status_message = (
    "Data quality verification passed successfully"
    if checkpoint_result.success
    else "Data quality verification failed"
)
print(status_message)

[1;35m	158 expectation(s) included in expectation_suite.[0m


Calculating Metrics:   0%|          | 0/448 [00:00<?, ?it/s]

Data quality verification passed successfully
