# Perkenalan

Nama : Gieorgie Kharismatik Kosasih 

Notebook ini dibuat untuk melakukan validasi data menggunakan Great Expectations. Data yang digunakan adalah data yang sudah dinormalisasi pada proses *pipeline* menggunakan Apache Airflow.

# Setting Up Great Expectations

In [1]:
!pip install -q great_expectations

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Create Data Context

In [2]:
# import library
from great_expectations.data_context import FileDataContext

In [3]:
# Create a file-based Data Context whose project root is the current working directory ('./')
context = FileDataContext.create(project_root_dir='./')

# Connect to a Datasource

In [4]:
# Name of the Datasource. This name must be unique between Datasources.
ds_name = 'cleaned_csv'
# Use add_or_update (not add) so that re-running this cell, or re-running the whole
# notebook against an existing project directory, does not fail because a Datasource
# with this name has already been registered in the Data Context.
datasource = context.sources.add_or_update_pandas(ds_name)

# Name of the data asset
asset_name = 'retail_sales'
# Path of the cleaned data set (CSV), relative to the working directory
file_path = 'P2M3_gieorgie_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=file_path)

# Build the batch request that the validator will use to load the data
batch_request = asset.build_batch_request()

# Create Expectation Suite

In [5]:
# Register the expectation suite (created if missing, replaced otherwise)
expectation_suite_name = 'expectation-retail-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Bind the batch request to the suite above to obtain a validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows of the batch held by the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax,total,date,time,payment,cogs,gross_margin_percentage,gross_income,rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,13,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,10,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,13,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,2019-01-27,20,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2019-02-08,10,Ewallet,604.17,4.761905,30.2085,5.3


## Expectation 1

In [6]:
# Expectation 1: values in the 'invoice_id' column must be unique
validator.expect_column_values_to_be_unique(
    column="invoice_id",
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "invoice_id",
      "batch_id": "cleaned_csv-retail_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation 2

In [7]:
# Expectation 2: every 'rating' value must lie between 0 and 10
validator.expect_column_values_to_be_between(
    column="rating",
    min_value=0,
    max_value=10,
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "min_value": 0,
      "max_value": 10,
      "column": "rating",
      "batch_id": "cleaned_csv-retail_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation 3

In [8]:
# Expectation 3: 'payment' values must belong to the set ["Ewallet", "Cash", "Credit card"]
validator.expect_column_values_to_be_in_set(
    column="payment",
    value_set=["Ewallet", "Cash", "Credit card"],
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "payment",
      "value_set": [
        "Ewallet",
        "Cash",
        "Credit card"
      ],
      "batch_id": "cleaned_csv-retail_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation 4

In [9]:
# Expectation 4: the 'quantity' column must be of integer type (int64)
validator.expect_column_values_to_be_of_type(
    column="quantity",
    type_="int64",
)




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_of_type",
    "kwargs": {
      "column": "quantity",
      "type_": "int64",
      "batch_id": "cleaned_csv-retail_sales"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation 5

In [12]:
# Expectation 5: 'date' values must match the date format %Y-%m-%d
validator.expect_column_values_to_match_strftime_format(
    column="date",
    strftime_format="%Y-%m-%d",
)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_match_strftime_format",
    "kwargs": {
      "column": "date",
      "strftime_format": "%Y-%m-%d",
      "batch_id": "cleaned_csv-retail_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation 6

In [14]:
# Expectation 6: values in the 'total' column must be greater than those in the 'cogs' column
validator.expect_column_pair_values_a_to_be_greater_than_b(
    column_A="total",
    column_B="cogs",
)




Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_pair_values_a_to_be_greater_than_b",
    "kwargs": {
      "column_A": "total",
      "column_B": "cogs",
      "batch_id": "cleaned_csv-retail_sales"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Expectation 7

In [15]:
# Expectation 7: the table must contain exactly 1000 rows
validator.expect_table_row_count_to_equal(
    value=1000,
)




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_row_count_to_equal",
    "kwargs": {
      "value": 1000,
      "batch_id": "cleaned_csv-retail_sales"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 1000
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [18]:
# Save the expectations collected on the validator into the Expectation Suite.
# discard_failed_expectations=False requests that expectations are not dropped on save
# even if they failed.
validator.save_expectation_suite(discard_failed_expectations=False)

Data berhasil divalidasi