In [62]:
import great_expectations as gx

In [63]:
import numpy as np

def load_ubyte_images(filename):
    """
    Returns the content of an IDX ubyte image file parsed as int matrices.

    The 16-byte header is parsed as four big-endian unsigned 32-bit integers
    (magic number, image count, rows, columns) instead of being skipped, so
    the per-image shape is taken from the file rather than assumed to be
    28x28. This keeps any downstream check on the image shape meaningful.

    Parameters
    ----------
    filename : str or path-like
        Path to an uncompressed IDX image file.

    Returns
    -------
    numpy.ndarray
        Read-only ``uint8`` array of shape ``(count, rows, cols)``.

    Raises
    ------
    ValueError
        If the header is shorter than 16 bytes, the magic number is not 2051,
        or the payload size disagrees with the dimensions in the header.
    """
    with open(filename, 'rb') as f:
        header = f.read(16)
        payload = f.read()

    if len(header) != 16:
        raise ValueError(
            f'{filename!r}: truncated IDX image header '
            f'({len(header)} of 16 bytes)'
        )

    # Header fields are stored most-significant byte first.
    magic, count, rows, cols = (
        int(field) for field in np.frombuffer(header, dtype='>u4')
    )

    # 2051 == 0x00000803: unsigned-byte data with three dimensions.
    if magic != 2051:
        raise ValueError(
            f'{filename!r}: not an IDX image file (magic number {magic}, '
            'expected 2051)'
        )

    expected_size = count * rows * cols
    if len(payload) != expected_size:
        raise ValueError(
            f'{filename!r}: header declares {count} images of {rows}x{cols} '
            f'({expected_size} bytes) but the file holds {len(payload)} bytes'
        )

    return np.frombuffer(payload, dtype=np.uint8).reshape(count, rows, cols)

def load_ubyte_labels(filename):
    """
    Returns the content of an IDX ubyte label file parsed as int scalars.

    The 8-byte header is parsed as two big-endian unsigned 32-bit integers
    (magic number, label count) and validated instead of being skipped, so a
    wrong or truncated file is reported rather than silently decoded.

    Parameters
    ----------
    filename : str or path-like
        Path to an uncompressed IDX label file.

    Returns
    -------
    numpy.ndarray
        Read-only one-dimensional ``uint8`` array with one entry per label.

    Raises
    ------
    ValueError
        If the header is shorter than 8 bytes, the magic number is not 2049,
        or the number of label bytes disagrees with the count in the header.
    """
    with open(filename, 'rb') as f:
        header = f.read(8)
        payload = f.read()

    if len(header) != 8:
        raise ValueError(
            f'{filename!r}: truncated IDX label header '
            f'({len(header)} of 8 bytes)'
        )

    # Header fields are stored most-significant byte first.
    magic, count = (int(field) for field in np.frombuffer(header, dtype='>u4'))

    # 2049 == 0x00000801: unsigned-byte data with one dimension.
    if magic != 2049:
        raise ValueError(
            f'{filename!r}: not an IDX label file (magic number {magic}, '
            'expected 2049)'
        )

    if len(payload) != count:
        raise ValueError(
            f'{filename!r}: header declares {count} labels but the file '
            f'holds {len(payload)} bytes'
        )

    return np.frombuffer(payload, dtype=np.uint8)

In [64]:
# Training split: image matrices and their labels.
train_images = load_ubyte_images(
    filename='../data/interim/unzipped/train-images-idx3-ubyte/train-images-idx3-ubyte'
)
train_labels = load_ubyte_labels(
    filename='../data/interim/unzipped/train-labels-idx1-ubyte/train-labels-idx1-ubyte'
)

# Test split ("t10k" files): image matrices and their labels.
test_images = load_ubyte_images(
    filename='../data/interim/unzipped/t10k-images-idx3-ubyte/t10k-images-idx3-ubyte'
)
test_labels = load_ubyte_labels(
    filename='../data/interim/unzipped/t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte'
)

In [65]:
# Plain Python lists holding the train entries followed by the test entries.
labels = list(train_labels) + list(test_labels)
images = list(train_images) + list(test_images)

In [66]:
import pandas as pd

# One row per image: the shape of its pixel array (a tuple) and its label, so
# both can be validated as dataframe columns.
train_set = pd.DataFrame({
        'image_shapes': [img.shape for img in train_images],
        'labels': train_labels
    }
)

# The shapes must be taken from test_images: iterating test_labels would
# record the empty shape () of each scalar label instead of the image size.
test_set = pd.DataFrame({
        'image_shapes': [img.shape for img in test_images],
        'labels': test_labels
    }
)

In [67]:
# Obtain a Great Expectations data context. get_context() is called without
# arguments, so the kind of context returned is whatever the library defaults
# to in this environment.
context = gx.get_context()

In [68]:
# Register a pandas data source called 'dataset' on the context; the train and
# test dataframe assets are added to this data source.
data_source = context.data_sources.add_pandas('dataset')

In [69]:
# One dataframe asset per split, created in train-then-test order.
train_asset, test_asset = (
    data_source.add_dataframe_asset(name=asset_name)
    for asset_name in ('train set', 'test set')
)

In [70]:
# A whole-dataframe batch definition for each asset, train first, then test.
train_batch_def, test_batch_def = (
    asset.add_batch_definition_whole_dataframe(batch_name)
    for asset, batch_name in (
        (train_asset, 'train batch'),
        (test_asset, 'test batch'),
    )
)

In [71]:
# Bind each batch definition to its in-memory dataframe to obtain the batches
# that the expectations are validated against.
train_batch = train_batch_def.get_batch(batch_parameters={'dataframe': train_set})
test_batch = test_batch_def.get_batch(batch_parameters={'dataframe': test_set})

In [72]:
# Every entry of the 'image_shapes' column must be the tuple (28, 28).
image_shape_expectation = gx.expectations.ExpectColumnValuesToBeInSet(column='image_shapes', value_set={(28, 28)})

# Every entry of the 'labels' column must fall between the bounds 0 and 9.
label_value_expectation = gx.expectations.ExpectColumnValuesToBeBetween(column='labels', min_value=0, max_value=9)

In [73]:
# Validate the train batch against both expectations, shape check first.
# Note: despite its name, validation_shape_res receives the result of the
# label-value expectation.
image_shape_res, validation_shape_res = (
    train_batch.validate(expectation)
    for expectation in (image_shape_expectation, label_value_expectation)
)

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
def f(data_source, images, labels, asset_name):
    """
    Build a whole-dataframe batch describing one dataset split.

    A dataframe with the columns 'image_shapes' (the ``.shape`` of every item
    in ``images``) and 'labels' is created, a dataframe asset named
    ``asset_name`` is added to ``data_source``, a batch definition named
    ``'<asset_name> batch'`` is added to that asset, and the batch obtained
    from that definition for the dataframe is returned.

    Parameters
    ----------
    data_source
        Object providing ``add_dataframe_asset(name=...)``.
    images
        Iterable of objects exposing a ``shape`` attribute.
    labels
        Values for the 'labels' column, one per image.
    asset_name : str
        Name given to the asset; also the prefix of the batch definition name.

    Returns
    -------
    The value returned by the batch definition's ``get_batch`` call.
    """
    shapes = [image.shape for image in images]
    frame = pd.DataFrame({'image_shapes': shapes, 'labels': labels})

    batch_definition = (
        data_source
        .add_dataframe_asset(name=asset_name)
        .add_batch_definition_whole_dataframe(f'{asset_name} batch')
    )

    return batch_definition.get_batch(batch_parameters={'dataframe': frame})

def g(batch, expectation, res):
    """
    Validate ``batch`` against ``expectation`` and record the outcome.

    The value returned by ``batch.validate(expectation)`` is appended to the
    list ``res`` in place; the function itself returns ``None``.
    """
    outcome = batch.validate(expectation)
    res.append(outcome)

# This cell rebinds context, data_source, both expectations and both batches
# that earlier cells defined, then runs the full validation through f and g.
context = gx.get_context()
data_source = context.data_sources.add_pandas('dataset')

# Image shapes must all be the tuple (28, 28).
image_shape_expectation = gx.expectations.ExpectColumnValuesToBeInSet(
    column='image_shapes', value_set={(28, 28)}
)
# Labels must all be members of range(10).
label_value_expectation = gx.expectations.ExpectColumnValuesToBeInSet(
    column='labels', value_set=range(10)
)

train_batch = f(data_source, train_images, train_labels, 'train set')
test_batch = f(data_source, test_images, test_labels, 'test set')

# Validation order: train/shape, train/label, test/shape, test/label.
res = []
for batch in (train_batch, test_batch):
    for expectation in (image_shape_expectation, label_value_expectation):
        g(batch, expectation, res)

# Displayed as the cell output: True only if every validation succeeded.
all(outcome.success for outcome in res)

  timestamp = datetime.utcnow().replace(tzinfo=tzutc())


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

True