In [19]:
# # Install the library

!pip install -q great-expectations

In [20]:
# Create a FileDataContext whose project root is the current working directory

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(
    project_root_dir='./',
)

In [21]:
# Register a pandas Datasource. Datasource names must be unique within the
# context, so `add_or_update_pandas` is used instead of `add_pandas`: the
# context is stored on disk, and a plain add would fail on a second
# "Restart & Run All" once the name has already been persisted.
datasource_name = 'cleaned_shopping_csv'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the cleaned CSV file as a data asset of that Datasource
asset_name = 'customer-shopping'
path_to_data = 'P2M3_kelvin_rizky_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used to load the asset into a validator
batch_request = asset.build_batch_request()

In [22]:
# Create the expectation suite (or update it if it already exists)
expectation_suite_name = 'expectation-customer-shopping-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Build a validator that ties the batch request to the suite above
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows of the batch to confirm the data was loaded
validator.head()




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,unnamed:_0,customer_id,age,gender,item_purchased,category,purchase_amount,location,size,color,season,review_rating,subscription_status,shipping_type,discount_applied,promo_code_used,previous_purchases,payment_method,frequency_of_purchases
0,0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


# Expectation 1: Customer ID must be Unique

In [23]:
# Expectation 1 : every value in column `customer_id` must be unique

validator.expect_column_values_to_be_unique(
    column='customer_id',
)

  and should_run_async(code)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "customer_id",
      "batch_id": "cleaned_shopping_csv-customer-shopping"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3900,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 2: The standard deviation of a column must fall within a given range

In [24]:
# Expectation 2 : Standard deviation of column `purchase_amount` must be between 10 and 25

validator.expect_column_stdev_to_be_between(
    column="purchase_amount",
    min_value=10,
    max_value=25,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_stdev_to_be_between",
    "kwargs": {
      "column": "purchase_amount",
      "min_value": 10,
      "max_value": 25,
      "batch_id": "cleaned_shopping_csv-customer-shopping"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 23.685392250875307
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 3: Column values must lie within a given range

In [25]:
# Expectation 3 : every value in column `age` must be between 18 and 70

validator.expect_column_values_to_be_between(
    column='age',
    min_value=18,
    max_value=70,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "age",
      "min_value": 18,
      "max_value": 70,
      "batch_id": "cleaned_shopping_csv-customer-shopping"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3900,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 4: Column `gender` may only contain the values Male and Female

In [26]:
# Expectation 4 : every value in column `gender` must be one of ['Male', 'Female']

validator.expect_column_values_to_be_in_set(
    column='gender',
    value_set=['Male', 'Female'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])



{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "gender",
      "value_set": [
        "Male",
        "Female"
      ],
      "batch_id": "cleaned_shopping_csv-customer-shopping"
    },
    "meta": {}
  },
  "result": {
    "element_count": 3900,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 5: A numeric column must have an integer data type

In [27]:
# Expectation 5 : the dtype of column `purchase_amount` must be one of [int32, int64]

validator.expect_column_values_to_be_in_type_list(
    column='purchase_amount',
    type_list=['int32', 'int64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_type_list",
    "kwargs": {
      "column": "purchase_amount",
      "type_list": [
        "int32",
        "int64"
      ],
      "batch_id": "cleaned_shopping_csv-customer-shopping"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 6: The median value of a column must fall within a given range

In [28]:
# Expectation 6 : the median of column `review_rating` must be between 3.6 and 3.8

validator.expect_column_median_to_be_between(
    column='review_rating',
    min_value=3.6,
    max_value=3.8,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_median_to_be_between",
    "kwargs": {
      "column": "review_rating",
      "min_value": 3.6,
      "max_value": 3.8,
      "batch_id": "cleaned_shopping_csv-customer-shopping"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 3.7
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Expectation 7: The dataset must contain exactly 3900 rows

In [29]:
# Expectation 7 : the table must contain exactly 3900 rows

validator.expect_table_row_count_to_equal(
    value=3900,
)

  and should_run_async(code)




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_row_count_to_equal",
    "kwargs": {
      "value": 3900,
      "batch_id": "cleaned_shopping_csv-customer-shopping"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 3900
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Conclusion

* The dataset passed all of the validation tests. The output of each validation test shows "success": true.