In [1]:
# Install the library

!pip install -q great-expectations

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Create a data context

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

In [3]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'csv-data-ecommerce'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'ecommerce'
path_to_data = 'P2M3_nurul_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

In [4]:
# Creat an expectation suite
expectation_suite_name = 'expectation-ecommerce'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,order_date,row_id,order_id,ship_mode,customer_id,segment,country,city,state,postal_code,region,product_id,category,sub-category,product_name,sales,quantity,discount,profit
0,01-01-20,849,CA-2017-107503,Standard Class,GA-14725,Consumer,United States,Lorain,Ohio,44052,East,FUR-FU-10003878,Furniture,Furnishings,"Linden 10"" Round Wall Clock, Black",48.896,4,0.2,8.5568
1,01-01-20,4010,CA-2017-144463,Standard Class,SC-20725,Consumer,United States,Los Angeles,California,90036,West,FUR-FU-10001215,Furniture,Furnishings,"Howard Miller 11-1/2"" Diameter Brentwood Wall ...",474.43,11,0.0,199.2606
2,01-01-20,6683,CA-2017-154466,First Class,DP-13390,Home Office,United States,Franklin,Wisconsin,53132,Central,OFF-BI-10002012,Office Supplies,Binders,Wilson Jones Easy Flow II Sheet Lifters,3.6,2,0.0,1.728
3,01-01-20,8070,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,OFF-ST-10002743,Office Supplies,Storage,SAFCO Boltless Steel Shelving,454.56,5,0.2,-107.958
4,01-01-20,8071,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,FUR-FU-10002116,Furniture,Furnishings,"Tenex Carpeted, Granite-Look or Clear Contempo...",141.42,5,0.6,-187.3815


In [14]:
# Expectation 1 : Column `quantity` must in form of integer

validator.expect_column_values_to_be_of_type("quantity", "int")

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "column": "quantity",
      "type_": "int",
      "batch_id": "csv-data-ecommerce-ecommerce"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# Expectation 2 : Column `order_date` must be unique

validator.expect_column_values_to_be_unique('order_date')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "order_date",
      "batch_id": "csv-data-ecommerce-ecommerce"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3312,
    "unexpected_count": 3296,
    "unexpected_percent": 99.51690821256038,
    "partial_unexpected_list": [
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "01-01-20",
      "02-01-20",
      "02-01-20",
      "02-01-20",
      "02-01-20",
      "02-01-20",
      "02-01-20",
      "02-01-20",
      "02-01-20",
      "03-01-20",
      "03-01-20"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 99.51690821256038,
    "unexpected_percent_nonmissing": 99.51690821256038
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback

In [17]:
# Expectation 3 : Column `ship_mode` must contain one of the following 3 things :
# 1 = Standard Class
# 2 = First Class
# 3 = Second Class


validator.expect_column_values_to_be_in_set('ship_mode', [1, 2, 3])

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "ship_mode",
      "value_set": [
        1,
        2,
        3
      ],
      "batch_id": "csv-data-ecommerce-ecommerce"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3312,
    "unexpected_count": 3312,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "Standard Class",
      "Standard Class",
      "First Class",
      "Standard Class",
      "Standard Class",
      "Standard Class",
      "Standard Class",
      "Standard Class",
      "Standard Class",
      "Standard Class",
      "First Class",
      "First Class",
      "First Class",
      "Second Class",
      "Second Class",
      "Second Class",
      "Second Class",
      "Second Class",
      "Standard Class",
      "Standard Class"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_

In [18]:
# Expectation 4 : Column `quantity` must be less than 14.000000

validator.expect_column_values_to_be_between(
    column='quantity', min_value=0, max_value=14.000000
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "quantity",
      "min_value": 0,
      "max_value": 14.0,
      "batch_id": "csv-data-ecommerce-ecommerce"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3312,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [19]:
# Expectation 5 : Column `order_date` must be match with strftime_format


validator.expect_column_values_to_match_strftime_format('order_date', "%m-%d-%y")




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_match_strftime_format",
    "kwargs": {
      "column": "order_date",
      "strftime_format": "%m-%d-%y",
      "batch_id": "csv-data-ecommerce-ecommerce"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3312,
    "unexpected_count": 1990,
    "unexpected_percent": 60.08454106280193,
    "partial_unexpected_list": [
      "13-01-20",
      "13-01-20",
      "13-01-20",
      "13-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20",
      "14-01-20"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 60.08454106280193,
    "unexpected_percent_nonmissing": 60.08454106280193
  },
  "meta": {},
  "exception_info": {
    "r

In [30]:
# Set an expectation for the values in column 'category' to be the same as column 'sub-category'

validator.expect_column_pair_values_to_be_equal('category', 'sub-category')

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_pair_values_to_be_equal",
    "kwargs": {
      "column_A": "category",
      "column_B": "sub-category",
      "batch_id": "csv-data-ecommerce-ecommerce"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3312,
    "unexpected_count": 3312,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      [
        "Furniture",
        "Furnishings"
      ],
      [
        "Furniture",
        "Furnishings"
      ],
      [
        "Office Supplies",
        "Binders"
      ],
      [
        "Office Supplies",
        "Storage"
      ],
      [
        "Furniture",
        "Furnishings"
      ],
      [
        "Furniture",
        "Chairs"
      ],
      [
        "Office Supplies",
        "Art"
      ],
      [
        "Office Supplies",
        "Binders"
      ],
      [
        "Office Supplies",
        "Binders"
      ],
      [
        "Office Supplies",
        "Appliance

In [43]:
# Set an expectation for the standard deviation of the 'Sales' column to be the provided value

validator.expect_column_stdev_to_be_between(column='sales', value=585.257531, tolerance=0)



Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_stdev_to_be_between",
    "kwargs": {
      "column": "sales",
      "value": 585.257531,
      "tolerance": 0,
      "batch_id": "csv-data-ecommerce-ecommerce"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 585.2575312625233
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [44]:
# Save into Expectation Suite

validator.save_expectation_suite(discard_failed_expectations=False)

In [45]:
# Create a checkpoint

checkpoint_1 = context.add_or_update_checkpoint(
    name = 'checkpoint_1',
    validator = validator,
)

In [47]:
# Run a checkpoint

checkpoint_result = checkpoint_1.run()




Calculating Metrics:   0%|          | 0/48 [00:00<?, ?it/s]

In [48]:
# Build data docs

context.build_data_docs()

{'local_site': 'file:///content/gx/uncommitted/data_docs/local_site/index.html'}