# Data Validation in Glue ETL Job

- Schema mismatch - field missing
- Schema mismatch - unknown field
- Schema mismatch - type error
- Schema mismatch - null value in a not-null field
- Value error - value range

Use case: bank account transaction data:

```python
{
    "id": 1, # int
    "name": "Alice", # required, not null
    "balance": 1000, # int, non-negative
}
```

In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from marshmallow import fields, Schema, validates
from faker import Faker
from s3pathlib import S3Path
from rich import print as rprint


In [21]:
class Config:
    """Settings shared by the cells of this notebook."""

    # AWS region used when the boto3 session is created.
    region = "us-east-2"

    # Glue Data Catalog database / table names used by the catalog cells.
    database = "learn_big_data_on_aws"
    table = "data_validation_bank_account"

    # S3 bucket and key prefix under which the sample JSON files are written.
    bucket = "aws-data-lab-sanhe-for-everything-us-east-2"
    prefix = "poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/data-validation/bank_account/"

    @property
    def s3path_prefix(self) -> S3Path:
        """Return the S3 location built from ``bucket`` and ``prefix``."""
        folder = S3Path(self.bucket, self.prefix)
        return folder


config = Config()

In [4]:
def _make_rows(first_id: int, last_id: int) -> list:
    """Return well-formed account rows for ids ``first_id`` .. ``last_id`` inclusive.

    Every row has the keys ``id``, ``name`` and ``balance`` (in that order),
    with ``name == f"user{id}"`` and ``balance == id * 100``.
    """
    return [
        {"id": i, "name": f"user{i}", "balance": i * 100}
        for i in range(first_id, last_id + 1)
    ]


# File 1: ten clean rows, nothing wrong.
data1 = _make_rows(1, 10)

# File 2: clean baseline with two schema problems injected.
data2 = _make_rows(11, 20)
data2[4]["password"] = "123456"  # id 15: unknown field
del data2[9]["name"]  # id 20: field missing

# File 3: clean baseline with three value problems injected.
data3 = _make_rows(21, 30)
data3[2]["balance"] = "2,300"  # id 23: type error
data3[5]["balance"] = -2600  # id 26: value range error
data3[9]["name"] = None  # id 30: not null value

def create_one(ith: int, data: list) -> None:
    """Upload one batch of records to S3 as a JSON Lines file.

    The file is written to ``<config.s3path_prefix>/<ith>.json`` with one
    compact JSON object per line.

    Each record is serialised on its own instead of going through a
    ``pandas.DataFrame``.  A DataFrame aligns all rows to the union of their
    keys, so an extra key in one record would be emitted (as ``null``) in
    every record, and a record lacking a key would get that key back with a
    ``null`` value.  That would erase the "unknown field" and "field missing"
    cases the sample data is meant to contain.

    :param ith: number used as the file name (``{ith}.json``).
    :param data: list of dicts; values must be serialisable by ``json.dumps``.
    """
    import json  # local import so the cell does not depend on another edit

    print(f"processing {ith} th file ...")
    s3path = S3Path(config.s3path_prefix, f"{ith}.json")
    with s3path.open("w") as f:
        for record in data:
            # Compact separators: one object per line, no padding spaces.
            f.write(json.dumps(record, separators=(",", ":")))
            f.write("\n")
    print(f"  open for preview: {s3path.console_url}")

create_one(1, data1)
create_one(2, data2)
create_one(3, data3)

processing 1 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/data-validation/1.json
processing 2 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/data-validation/2.json
processing 3 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/data-validation/3.json


## Create Glue Catalog

In [8]:
import boto3

# A single session pinned to the configured region backs both clients.
boto_ses = boto3.session.Session(region_name=config.region)
sts_client = boto_ses.client("sts")
glue_client = boto_ses.client("glue")

# Ask STS which account the active credentials belong to; the Glue calls
# below pass this value as ``CatalogId``.
account_id = sts_client.get_caller_identity()["Account"]
print(f"aws account id = {account_id}")

aws account id = 669508176277


In [26]:
from botocore.exceptions import ClientError

# Make sure the Glue database exists: look it up and create it only when Glue
# reports that it is missing.  Only EntityNotFoundException is handled here;
# any other failure (access denied, throttling, bad credentials, ...) is left
# to propagate instead of being mistaken for "database does not exist".
try:
    response = glue_client.get_database(
        CatalogId=account_id,
        Name=config.database,
    )
except glue_client.exceptions.EntityNotFoundException:
    glue_client.create_database(
        CatalogId=account_id,
        DatabaseInput=dict(
            Name=config.database,
            Description="For project https://github.com/MacHu-GWU/learn_big_data_on_aws-project",
        )
    )
    
# Make sure the Glue table exists: look it up and create it only when Glue
# reports that it is missing.  Other errors propagate unchanged.
# NOTE: an already existing table is left untouched, so a table created with
# an older definition has to be deleted before this cell can recreate it.
try:
    response = glue_client.get_table(
        CatalogId=account_id,
        DatabaseName=config.database,
        Name=config.table,
    )
except glue_client.exceptions.EntityNotFoundException:
    response = glue_client.create_table(
        CatalogId=account_id,
        DatabaseName=config.database,
        TableInput=dict(
            Name=config.table,
            # Athena expects an explicit table type on tables created through
            # the Glue API.
            TableType="EXTERNAL_TABLE",
            StorageDescriptor=dict(
                Columns=[
                    dict(
                        Name="id",
                        # Catalog columns use Hive DDL type names: "int", not
                        # "integer".
                        Type="int",
                    ),
                    dict(
                        Name="name",
                        Type="string",
                    ),
                    dict(
                        Name="balance",
                        Type="int",
                    ),
                ],
                Location=config.s3path_prefix.uri,
                InputFormat="org.apache.hadoop.mapred.TextInputFormat",
                OutputFormat="org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
                SerdeInfo=dict(
                    # The SerDe class belongs in SerializationLibrary; the
                    # "Name" field is only a free-form label.
                    SerializationLibrary="org.openx.data.jsonserde.JsonSerDe",
                ),
            ),
            Parameters=dict(
                classification="json",
            ),
        ),
    )