In [1]:
import sys
from pyspark.context import SparkContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import *

# Create (or reuse) the SparkContext for this session
sparkContext = SparkContext.getOrCreate()

# Wrap the SparkContext in a GlueContext, which provides the DynamicFrame readers used below
glueContext = GlueContext(sparkContext)

# Spark session exposed by the GlueContext
spark = glueContext.spark_session

# Resolve job parameters
# Uncomment the three lines below when this code runs as a Glue ETL job
# args = getResolvedOptions(sys.argv, ["JOB_NAME"])
# job = Job(glueContext)
# job.init(args['JOB_NAME'], args)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1646085135716_0025,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Install the third-party packages listed in requirements.txt (quiet mode)
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from s3pathlib import S3Path
from marshmallow import fields, Schema, validates, ValidationError

class Config:
    """Static settings for this notebook: the S3 location of the example data."""

    # S3 bucket that holds the example files
    bucket = "aws-data-lab-sanhe-for-everything-us-east-2"
    # key prefix (folder) under the bucket
    prefix = "poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/data-validation"

    @property
    def s3path_prefix(self) -> S3Path:
        """Return the ``S3Path`` built from ``bucket`` and ``prefix``."""
        location = S3Path(self.bucket, self.prefix)
        return location


# Settings object referenced by the reader cells
config = Config()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [3]:
class RowSchema(Schema):
    """Validation rules for one input record.

    - ``id``: optional integer
    - ``name``: required string
    - ``balance``: optional integer that must not be negative
    """

    id = fields.Int()
    name = fields.Str(required=True)
    balance = fields.Int()

    @validates("balance")
    def validate_balance(self, value, **kwargs):
        """Reject negative balances.

        ``**kwargs`` absorbs the ``data_key`` keyword that marshmallow 4 passes
        to ``@validates`` methods; marshmallow 3 passes only ``value``, so the
        method works with both major versions.

        :param value: the already-deserialized integer balance.
        :raises ValidationError: if ``value`` is below zero.
        """
        # NOTE(review): zero passes this check although the message says
        # "greater than 0" -- confirm which of the two is intended.
        if value < 0:
            raise ValidationError("Balance must be greater than 0.")


# Reusable schema instance; ``schema.validate(data)`` returns a dict of errors
schema = RowSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# valid record: every field passes, so the returned error dict is empty
schema.validate({"id": 1, "name": "user1", "balance": 100})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{}

In [5]:
# unknown field: "password" is not declared on RowSchema, so it is reported as an error
schema.validate({"id": 15, "name": "user15", "balance": 1500, "password": "123456"})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'password': ['Unknown field.']}

In [6]:
# missing field: the required "name" key is absent
schema.validate({"id": 20, "balance": 2000})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'name': ['Missing data for required field.']}

In [7]:
# type error: "2,300" is a string that is not a valid integer
schema.validate({"id": 23, "name": "user23", "balance": "2,300"})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'balance': ['Not a valid integer.']}

In [8]:
# value range error: a negative balance is rejected by validate_balance
schema.validate({"id": 26, "name": "user26", "balance": -2600})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'balance': ['Balance must be greater than 0.']}

In [9]:
# null value: "name" is present but None, which is rejected
schema.validate({"id": 30, "name": None, "balance": 3000})

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'name': ['Field may not be null.']}

In [10]:
import pynamodb
from pynamodb.models import Model
from pynamodb.connection import Connection
from pynamodb.attributes import UnicodeAttribute, NumberAttribute, JSONAttribute

# Low-level PynamoDB connection for the region the table lives in
connection = Connection(region="us-east-2")


class ValidationTracker(Model):
    """DynamoDB item keyed by an S3 URI, carrying a numeric status and a JSON payload.

    NOTE(review): the meaning of the individual ``status`` codes is not defined
    in this notebook -- confirm against the code that writes these items.
    """

    class Meta:
        """
        declare metadata about the table
        """
        table_name = "learn_big_data_on_aws_glue_validation"
        region = "us-east-2"

        # billing mode
        # doc: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.ReadWriteCapacityMode.html
        # pay as you go mode
        billing_mode = pynamodb.models.PAY_PER_REQUEST_BILLING_MODE

        # provisioned mode
        # write_capacity_units = 10
        # read_capacity_units = 10

    # define attributes
    s3uri = UnicodeAttribute(hash_key=True)
    status = NumberAttribute(default=0)  # set default value for attribute
    # Use a callable default so every item gets its own dict; a literal ``{}``
    # would be one shared object mutated by all instances.
    details = JSONAttribute(default=dict)


# Create dynamodb table if not exists, if already exists, this code won't do anything
ValidationTracker.create_table(wait=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Load only 1.json from the example prefix into a DynamicFrame.
gdf1 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=dict(
        paths=[
            S3Path(config.s3path_prefix, "1.json").uri,
        ],
        recurse=True,
    ),
    format="json",
    format_options=dict(multiLine=True),
    # Job bookmarks key their state on transformation_ctx, so each source gets
    # its own value instead of sharing "datasource" with the other readers.
    transformation_ctx="datasource_1",
)

In [None]:
# Convert the DynamicFrame to a Spark DataFrame and print a preview of its rows
gdf1.toDF().show()

In [24]:
# Load only 3.json from the example prefix into a DynamicFrame.
gdf3 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=dict(
        paths=[
            S3Path(config.s3path_prefix, "3.json").uri,
        ],
        recurse=True,
    ),
    format="json",
    format_options=dict(multiLine=True),
    # Job bookmarks key their state on transformation_ctx, so each source gets
    # its own value instead of sharing "datasource" with the other readers.
    transformation_ctx="datasource_3",
)
# Print every record as a plain dict. "balance" appears as an
# {'int': ..., 'string': ...} struct because the file holds both integer and
# string balances.
for row in gdf3.toDF().collect():
    print(row.asDict(recursive=True))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'id': 21, 'name': 'user21', 'balance': {'int': 2100, 'string': None}}
{'id': 22, 'name': 'user22', 'balance': {'int': 2200, 'string': None}}
{'id': 23, 'name': 'user23', 'balance': {'int': None, 'string': '2,300'}}
{'id': 24, 'name': 'user24', 'balance': {'int': 2400, 'string': None}}
{'id': 25, 'name': 'user25', 'balance': {'int': 2500, 'string': None}}
{'id': 26, 'name': 'user26', 'balance': {'int': -2600, 'string': None}}
{'id': 27, 'name': 'user27', 'balance': {'int': 2700, 'string': None}}
{'id': 28, 'name': 'user28', 'balance': {'int': 2800, 'string': None}}
{'id': 29, 'name': 'user29', 'balance': {'int': 2900, 'string': None}}
{'id': 30, 'name': None, 'balance': {'int': 3000, 'string': None}}

In [25]:
def validate_row(row: dict) -> bool:
    """Return ``True`` when ``row`` passes ``schema`` validation, ``False`` otherwise.

    ``schema.validate`` returns a dict of errors, which is empty (falsy) for a
    valid record.
    """
    return not schema.validate(row)


# Keep only the records that validate, then preview them
gdf3_filtered = gdf3.filter(validate_row)
gdf3_filtered.toDF().show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+------+---+
|balance|  name| id|
+-------+------+---+
|   2100|user21| 21|
|   2200|user22| 22|
|   2400|user24| 24|
|   2500|user25| 25|
|   2700|user27| 27|
|   2800|user28| 28|
|   2900|user29| 29|
+-------+------+---+

In [26]:
# Load every JSON file under the example prefix into one DynamicFrame.
gdf = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=dict(
        paths=[
            S3Path(config.s3path_prefix).uri,
        ],
        # the path is a prefix, so descend into it to pick up all files
        recurse=True,
    ),
    format="json",
    format_options=dict(multiLine=True),
    # Job bookmarks key their state on transformation_ctx, so each source gets
    # its own value instead of sharing "datasource" with the other readers.
    transformation_ctx="datasource_all",
)
# Preview the combined data
gdf.toDF().show()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+------+---------+--------+
| id|  name|  balance|password|
+---+------+---------+--------+
| 18|user18|  [1800,]|    null|
| 29|user29|  [2900,]|    null|
|  1| user1|   [100,]|    null|
| 19|user19|  [1900,]|    null|
| 30|  null|  [3000,]|    null|
|  2| user2|   [200,]|    null|
| 20|  null|  [2000,]|    null|
|  3| user3|   [300,]|    null|
|  4| user4|   [400,]|    null|
|  5| user5|   [500,]|    null|
|  6| user6|   [600,]|    null|
|  7| user7|   [700,]|    null|
|  8| user8|   [800,]|    null|
|  9| user9|   [900,]|    null|
| 10|user10|  [1000,]|    null|
| 21|user21|  [2100,]|    null|
| 11|user11|  [1100,]|    null|
| 22|user22|  [2200,]|    null|
| 12|user12|  [1200,]|    null|
| 23|user23|[, 2,300]|    null|
+---+------+---------+--------+
only showing top 20 rows

In [29]:
# Keep only the three columns the schema knows about, then preview the first three rows
gdf_selected = gdf.select_fields(paths=["id", "name", "balance"])
gdf_selected.toDF().show(n=3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+------+-------+
| id|  name|balance|
+---+------+-------+
| 18|user18|[1800,]|
| 29|user29|[2900,]|
|  1| user1| [100,]|
+---+------+-------+
only showing top 3 rows