In [16]:
# Install the packages listed in requirements.txt (-q keeps pip's output quiet).
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [16]:
import json
import random
from typing import Dict, List, Optional

import pandas as pd
from faker import Faker
from rich import print as rprint
from s3pathlib import S3Path

In [3]:
class Config:
    """
    Settings shared by the cells of this notebook: where the generated
    files live on S3 and how large the generated dataset is.
    """

    # S3 bucket that holds the generated files
    bucket: str = "aws-data-lab-sanhe-for-everything-us-east-2"
    # key prefix under the bucket
    prefix: str = "poc/learn-big-data-on-aws/glue-job-examples/missing-fields-and-null-values"
    # how many data files make up the dataset
    n_files: int = 3
    # how many records go into each file
    n_records_per_file: int = 100

    @property
    def s3path_prefix(self) -> "S3Path":
        """
        The bucket and prefix combined into a single ``S3Path``.
        """
        return S3Path(self.bucket, self.prefix)


config = Config()

In [19]:
def json_dump_multiline(data: List[dict]) -> str:
    """
    Serialize a list of dictionaries as JSON Lines text.

    Each dictionary becomes one ``json.dumps`` line; lines are separated by
    a newline and there is no trailing newline. An empty list gives ``""``.
    """
    return "\n".join(json.dumps(record) for record in data)

In [22]:

# Faker instance; not referenced by any of the cells visible here.
fake = Faker()


def create_one(
    nth_file: int,
    has_bad_data: bool = True,
    write_to_s3: bool = True,
    seed: Optional[int] = None,
) -> List[dict]:
    """
    Build the records of the ``nth_file``-th data file and optionally upload
    them to S3 as a multi-line JSON file.

    Every record starts as ``{"id", "with_missing_field", "with_null_value"}``
    where both value fields are random integers in 1..100. Ids are
    consecutive and partitioned by file: file 1 covers
    ``1 .. config.n_records_per_file``, file 2 continues right after, etc.

    :param nth_file: file number; ``1`` yields ids starting at 1. Also used
        (zero-padded to 3 digits) as the S3 file name.
    :param has_bad_data: when true, each record independently has a 30%
        chance of losing the ``with_missing_field`` key and a 30% chance of
        getting ``with_null_value`` set to ``None``.
    :param write_to_s3: when true, write the records to
        ``<config.bucket>/<config.prefix>/<NNN>.json`` and print its console
        url; when false nothing is uploaded.
    :param seed: optional seed for a private random generator so the file
        can be regenerated exactly. ``None`` (default) draws from the
        module-level ``random`` generator, as before.
    :return: the list of generated records.
    """
    print(f"processing {nth_file} th file ...")

    # Without a seed keep using the shared module-level generator so existing
    # callers (including ones that call random.seed themselves) are unaffected.
    rng = random if seed is None else random.Random(seed)

    id_start = 1 + (nth_file - 1) * config.n_records_per_file
    id_end = id_start + config.n_records_per_file
    data = [
        {
            "id": record_id,
            "with_missing_field": rng.randint(1, 100),
            "with_null_value": rng.randint(1, 100),
        }
        for record_id in range(id_start, id_end)
    ]

    if has_bad_data:
        for dct in data:
            # two independent draws per record, so a record may end up with
            # both defects, only one of them, or none
            if rng.randint(1, 100) <= 30:
                del dct["with_missing_field"]
            if rng.randint(1, 100) <= 30:
                dct["with_null_value"] = None

    if write_to_s3:
        content = json_dump_multiline(data)
        s3path = S3Path(config.bucket, config.prefix, f"{str(nth_file).zfill(3)}.json")
        s3path.write_text(content)
        print(f"  open for preview: {s3path.console_url}")

    return data

# Dry run: build the first file's records in memory (no S3 upload) and print the first 30 with rich.
data = create_one(1, has_bad_data=True, write_to_s3=False)
rprint(data[:30])

processing 1 th file ...


In [23]:
def create_many():
    """
    Create all ``config.n_files`` data files, numbered from 1.

    Every file except the last one is generated with bad data (missing
    fields / null values); the last file is generated clean. For the default
    ``n_files = 3`` this is the sequence bad, bad, clean.
    """
    for nth_file in range(1, config.n_files + 1):
        # only the final file is free of bad data
        create_one(nth_file, has_bad_data=nth_file < config.n_files)
    
# Generate every file; each create_one call uses its default write_to_s3=True, so this uploads to S3.
create_many()

processing 1 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/missing-fields-and-null-values/001.json
processing 2 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/missing-fields-and-null-values/002.json
processing 3 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/missing-fields-and-null-values/003.json
