In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from rich import print as rprint
from faker import Faker
from s3pathlib import S3Path
import pandas as pd

In [3]:
class Config:
    """
    Knobs for the synthetic product dataset and the S3 location it is written to.
    """

    # --- S3 destination -----------------------------------------------------
    bucket: str = "aws-data-lab-sanhe-for-everything-us-east-2"
    prefix: str = "poc/learn-big-data-on-aws/glue-job-examples/03-transformation-examples/05-flatten-and-unnest-json"

    # --- dataset size -------------------------------------------------------
    n_files: int = 10  # how many JSON files to produce
    n_records_per_file: int = 1000  # product rows in each file
    n_product_category: int = 20  # number of words drawn for the category pool

    @property
    def s3path_prefix(self):
        """
        The ``S3Path`` built from :attr:`bucket` and :attr:`prefix`.
        """
        location = S3Path(self.bucket, self.prefix)
        return location


# Single shared settings object used by the cells below.
config = Config()

In [4]:
import random

fake = Faker()

# Pool of category names that create_one() samples from.
#
# The words are drawn through Faker's ``unique`` proxy so the pool contains
# exactly ``config.n_product_category`` distinct entries. Deduplicating plain
# ``fake.word()`` draws with ``set`` silently shrank the pool whenever a word
# repeated and left the list order dependent on set hashing.
# NOTE(review): ``fake.unique`` raises if it cannot find a fresh word, which
# would only matter for a category count larger than Faker's vocabulary.
product_categories = [
    fake.unique.word()
    for _ in range(config.n_product_category)
]

def create_one(nth_file: int) -> pd.DataFrame:
    """
    Build the fake product records that belong to one output file.

    Product ids are contiguous and never overlap between files: file ``n``
    (1-based) holds ids ``(n - 1) * n_records_per_file + 1`` through
    ``n * n_records_per_file``.

    Each row has the following columns:

    - ``id``: the product id (int).
    - ``name``: a random word.
    - ``price``: a random integer between 1 and 100, inclusive.
    - ``specs``: a nested dict with a single ``color`` key.
    - ``categories``: up to three distinct entries of ``product_categories``.
    - ``reviews``: a list of zero to five ``{"rank", "comment"}`` dicts, with
      ``rank`` between 1 and 5.

    :param nth_file: 1-based sequence number of the file to generate.
    :return: a DataFrame with ``config.n_records_per_file`` rows.
    """
    product_id_start = 1 + (nth_file - 1) * config.n_records_per_file
    product_id_end = product_id_start + config.n_records_per_file

    # random.sample() raises ValueError when asked for more items than the
    # population holds, so clamp the 1..3 range to the size of the category
    # pool (an empty pool yields an empty category list).
    max_categories = min(3, len(product_categories))
    min_categories = min(1, max_categories)

    data = [
        {
            "id": product_id,
            "name": fake.word(),
            "price": random.randint(1, 100),
            "specs": {
                "color": fake.color_name()
            },
            "categories": random.sample(
                product_categories,
                random.randint(min_categories, max_categories),
            ),
            "reviews": [
                {"rank": random.randint(1, 5), "comment": fake.sentence()}
                for _ in range(random.randint(0, 5))
            ]
        }
        for product_id in range(product_id_start, product_id_end)
    ]
    df = pd.DataFrame(data)
    return df

# Sanity check: generate the frame for file #1 and pretty-print its first
# three rows as plain dicts so the nested columns are easy to inspect.
df = create_one(1)
preview_records = df.head(3).to_dict(orient="records")
rprint(preview_records)

In [5]:
def create_many():
    """
    Generate every configured file and write each one to S3.

    For file numbers 1 through ``config.n_files`` this builds the records with
    ``create_one`` and writes them as line-delimited JSON (one record per
    line) to ``<bucket>/<prefix>/<NNN>.json``, where ``NNN`` is the file
    number zero-padded to three digits. Progress and the console URL of each
    object are printed along the way.
    """
    total_files = config.n_files
    for nth_file in range(1, total_files + 1):
        print(f"processing {nth_file} th file ...")
        file_name = f"{str(nth_file).zfill(3)}.json"
        records = create_one(nth_file)
        s3path = S3Path(config.bucket, config.prefix, file_name)
        with s3path.open("w") as stream:
            records.to_json(stream, orient="records", lines=True)
        print(f"  open for preview: {s3path.console_url}")


create_many()

processing 1 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/03-transformation-examples/05-flatten-and-unnest-json/001.json
processing 2 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/03-transformation-examples/05-flatten-and-unnest-json/002.json
processing 3 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/03-transformation-examples/05-flatten-and-unnest-json/003.json
processing 4 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/03-transformation-examples/05-flatten-and-unnest-json/004.json
processing 5 th file ...