# Great Expectations

#### 특정 데이터셋의 품질관리 및 데이터 파이프라인에서 데이터의 무결성과 일관성을 확인하는 데 도움을 주는 데이터 검증 라이브러리

<img src="https://velog.velcdn.com/images/newnew_daddy/post/d510bf11-9c75-491b-b8ea-485866e838c2/image.png" width="40%">
<img src="https://velog.velcdn.com/images/newnew_daddy/post/c801020f-ce37-442a-bf82-3c0d20d485ea/image.png" width="45%">

### product.csv DATASET

>1. product DATASET의 row의 수가 500-1000 사이인가?
>2. 'ProductKey' 컬럼에 중복 값은 없을까?
>3. 'EnglishDescription' 컬럼에 Null값의 비율이 40% 이하일까?
>4. 'ReorderPoint' 컬럼의 데이터 타입이 int64 혹은 float64 형으로 구성되어 있을까?
>5. 'ProductSubcategoryKey' 컬럼의 값들의 범위가 0-40 사이인가?
>6. 'ProductLine' 컬럼이 가진 값들이 ['R', 'S', 'M', 'T'] 내에 존재할까?

#### >>> 이렇게 여러 개의 조건에 대한 검증을 한 번에 수행할 수는 없을까?



#### 1. Python 함수를 통한 검증

In [None]:
import great_expectations as ge


In [None]:
import pandas as pd

# Load the product dataset into a pandas DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../dataset/data-03/product.csv")

In [None]:
def validation(df):
    """Run the six product-dataset quality checks and report whether all pass.

    The check numbers match the numbered questions at the top of this notebook:

      1. The row count is between 500 and 1000 (inclusive).
      2. 'ProductKey' contains no duplicate values.
      3. At most 40% of the 'EnglishDescription' values are null.
      4. 'ReorderPoint' has dtype int64 or float64.
      5. 'ProductSubcategoryKey' values lie within 0-40 (inclusive).
      6. Every non-null 'ProductLine' value is one of 'R', 'S', 'M', 'T'.

    The list of check numbers that passed is printed as a side effect.

    Args:
        df: pandas DataFrame with the columns 'ProductKey',
            'EnglishDescription', 'ReorderPoint', 'ProductSubcategoryKey'
            and 'ProductLine'.

    Returns:
        True if all six checks pass, otherwise False.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    allowed_lines = {'R', 'S', 'M', 'T'}
    row_count = len(df)
    passed = []

    # 1. Row count within the expected range.
    if 500 <= row_count <= 1000:
        passed.append(1)

    # 2. Primary-key uniqueness.
    if df['ProductKey'].is_unique:
        passed.append(2)

    # 3. Null share of at most 40%. Integer arithmetic keeps the 40% boundary
    #    exact and avoids dividing by zero on an empty frame.
    null_count = int(df['EnglishDescription'].isna().sum())
    if null_count * 100 <= 40 * row_count:
        passed.append(3)

    # 4. Numeric dtype.
    if str(df['ReorderPoint'].dtype) in ('int64', 'float64'):
        passed.append(4)

    # 5. Value range. min()/max() are NaN for an empty or all-null column,
    #    which makes both comparisons False, so the check fails in that case.
    subcategory = df['ProductSubcategoryKey']
    if subcategory.min() >= 0 and subcategory.max() <= 40:
        passed.append(5)

    # 6. The column's values must be a subset of the allowed set (not the other
    #    way round). Nulls are dropped so missing values are not treated as an
    #    illegal category.
    observed_lines = set(df['ProductLine'].dropna().unique())
    if observed_lines <= allowed_lines:
        passed.append(6)

    print(passed)
    return len(passed) == 6

In [None]:
# ge_df.expect_table_row_count_to_be_between(500, 1000)
# ge_df.expect_column_values_to_be_unique('ProductKey')
# ge_df.expect_column_values_to_not_be_null("EnglishDescription", mostly=0.60)
# ge_df.expect_column_values_to_be_in_type_list("ReorderPoint", ["int64", "float64"])
# ge_df.expect_column_values_to_be_between('ProductSubcategoryKey', min_value=0, max_value=40)
# ge_df.expect_column_values_to_be_in_set("ProductLine", ['R', 'S', 'M', 'T'])

#### 2. Great Expectations Tool을 통한 검증

In [None]:
# Wrap the pandas DataFrame with Great Expectations; the expect_* calls in the
# following cells are made on this wrapped object.
ge_df = ge.from_pandas(df)

In [None]:
# Show the type of the object returned by ge.from_pandas.
type(ge_df)

### GE Data Quality Tests

In [None]:
# Expect the dataset to contain exactly 1000 rows.
ge_df.expect_table_row_count_to_equal(1000)

# Range-based alternative, left disabled:
# ge_df.expect_table_row_count_to_be_between(1000, 50000)

### Primary Key Test

In [None]:
# Expect the 'ProductKey' column to be present in the dataset.
ge_df.expect_column_to_exist('ProductKey')

In [None]:
# Expect 'ProductKey' to contain no duplicate values.
ge_df.expect_column_values_to_be_unique('ProductKey')

In [None]:
# Expect 'ProductSubcategoryKey' values to fall between 0 and 35.
# NOTE(review): mostly=0.8 presumably lets the expectation succeed when at
# least 80% of the values comply — confirm against the Great Expectations docs.
ge_df.expect_column_values_to_be_between('ProductSubcategoryKey', min_value=0, max_value=35, mostly=0.8)

In [None]:
# Expect the type of 'ReorderPoint' to be one of the listed type names.
ge_df.expect_column_values_to_be_in_type_list("ReorderPoint", ["float64", "int64"])

### Test values in a set (list)

In [None]:
# List the distinct values found in the 'ProductLine' column.
df.ProductLine.unique()

In [None]:
# Expect every 'ProductLine' value to be one of 'R', 'S', 'M', 'T'.
ge_df.expect_column_values_to_be_in_set("ProductLine", ['R', 'S', 'M', 'T'])

In [None]:
# Expect every 'Color' value to be one of the colours listed here.
# Each colour appears once; repeating an entry has no effect on set membership.
ge_df.expect_column_values_to_be_in_set(
    "Color",
    ['Black', 'Silver', 'Red', 'White', 'Blue', 'Multi', 'Yellow', 'Grey'],
)

### Check min and Max range of column

In [None]:
# List the distinct values found in the 'SafetyStockLevel' column.
df.SafetyStockLevel.unique()

In [None]:
# Expect the maximum of 'SafetyStockLevel' to lie between 1 and 1000.
ge_df.expect_column_max_to_be_between("SafetyStockLevel", 1, 1000)

In [None]:
# Expect the maximum of 'DaysToManufacture' to lie between 1 and 10.
ge_df.expect_column_max_to_be_between("DaysToManufacture", 1, 10)

In [None]:
# Expect the mean of 'StandardCost' to lie between 100 and 500.
ge_df.expect_column_mean_to_be_between("StandardCost", 100, 500)

### Test Text columns (Nulls)

In [None]:
# Expect 'EnglishProductName' to contain no null values.
ge_df.expect_column_values_to_not_be_null('EnglishProductName')

In [None]:
# Expect 'Color' to be non-null.
# NOTE(review): mostly=0.60 presumably means at least 60% of the values must be
# non-null for the expectation to succeed — confirm against the GE docs.
ge_df.expect_column_values_to_not_be_null("Color", mostly=0.60)

### Save your test cases and re-use

In [None]:
# Display the expectation suite held by ge_df.
# NOTE(review): the legacy Great Expectations API is believed to omit failed
# expectations here by default (discard_failed_expectations=True) — confirm
# for the installed version.
ge_df.get_expectation_suite()

In [None]:
# Write the expectation suite to product_suite.json so it can be reused later.
# The path is relative, so the file lands in the current working directory.
ge_df.save_expectation_suite("product_suite.json")

In [None]:
# Keep the expectation suite in memory so it can be passed to validate().
config = ge_df.get_expectation_suite()

In [None]:
# Load two CSV files through Great Expectations' read_csv.
# NOTE(review): the file names suggest one clean and one deliberately broken
# copy of the product data — confirm. Both paths are relative to the working
# directory.
df_true = ge.read_csv("product_true.csv")
df_error = ge.read_csv("product_error.csv")

In [None]:
# Validate df_true against the in-memory expectation suite held in `config`.
df_true.validate(expectation_suite=config)

### Test with Config file

In [None]:
# Validate df_error against the suite stored in product_suite.json, the file
# name passed to save_expectation_suite earlier in this notebook.
# Disabled variant that reads a different suite file and keeps the result:
# test_results = df_error.validate(expectation_suite="product.data.expectations.json")
df_error.validate(expectation_suite="product_suite.json")