===============================================================================================

**Supermarket Sales Analysis Validation File**

This notebook validates the cleaned supermarket sales dataset by checking that it passes eight Great Expectations expectations.

===============================================================================================

In [1]:
# import library
from great_expectations.data_context import FileDataContext

In [2]:
# Create a file-backed Data Context rooted at the current working directory
context = FileDataContext.create(
    project_root_dir='./',
)

In [4]:
# Register a pandas Datasource. The name must be unique between Datasources.
# `add_or_update_pandas` is used instead of `add_pandas` so that this cell can be
# re-run (e.g. Restart Kernel -> Run All) against the same file-backed context
# without failing on a Datasource name that already exists from a previous run.
datasource_name = 'csv-supermarket-saless'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the cleaned CSV file as a data asset of that Datasource
asset_name = 'supermarket-sales'
path_to_data = 'P2M3_hafiz_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used later to create the validator
batch_request = asset.build_batch_request()

In [5]:
# Create the expectation suite (or update it if it already exists)
expectation_suite_cleandata_name = 'expectation-sales-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_cleandata_name
)

# Build a validator that ties the batch request to the expectation suite above
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_cleandata_name,
)

# Preview the first rows of the batch held by the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5%,total,date,time,payment,cogs,gross_margin_percentage,gross_income,rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


In [6]:
# Expectation 1: the maximum value of column `rating` must lie between 5.3 and 10.0
validator.expect_column_max_to_be_between(
    column='rating',
    min_value=5.3,
    max_value=10.00,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "observed_value": 10.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [7]:
# Expectation 2: every value in column `invoice_id` must be unique
validator.expect_column_values_to_be_unique(column='invoice_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [8]:
# Expectation 3: column `payment` may only contain 'Cash', 'Ewallet' or 'Credit card'
validator.expect_column_values_to_be_in_set(
    column='payment',
    value_set=['Cash', 'Ewallet', 'Credit card'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [9]:
# Expectation 4: column `gross_income` must have a float data type
validator.expect_column_values_to_be_in_type_list(
    column='gross_income',
    type_list=['float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "observed_value": "float64"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [10]:
# Expectation 5: the minimum value of column `quantity` must lie between 1 and 9
validator.expect_column_min_to_be_between(
    column='quantity',
    min_value=1,
    max_value=9,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "observed_value": 1
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [11]:
# Expectation 6: at least 25% of the values in column `city` must match the regex 'Yangon'
# (`mostly=0.25` is a minimum fraction, so the check passes even when most rows are other cities)
validator.expect_column_values_to_match_regex(
    column='city',
    regex='Yangon',
    mostly=0.25,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "element_count": 1000,
    "unexpected_count": 660,
    "unexpected_percent": 66.0,
    "partial_unexpected_list": [
      "Naypyitaw",
      "Naypyitaw",
      "Naypyitaw",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Naypyitaw",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Mandalay",
      "Naypyitaw",
      "Naypyitaw",
      "Naypyitaw",
      "Mandalay"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 66.0,
    "unexpected_percent_nonmissing": 66.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [12]:
# Expectation 7: the median of column `unit_price` must lie between 10 and 100
validator.expect_column_median_to_be_between(
    column='unit_price',
    min_value=10,
    max_value=100,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "observed_value": 55.230000000000004
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [13]:
# Expectation 8: the table must contain exactly 1000 rows
validator.expect_table_row_count_to_equal(value=1000)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "meta": {},
  "result": {
    "observed_value": 1000
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true
}

In [14]:
# Persist every expectation defined above into the Expectation Suite,
# keeping expectations even if they failed on this batch
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

# Conclusion

* The dataset passed all eight expectations: the output of every validation cell above reports `"success": true`.