# Status Tracking

```python
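{
    "trans_id": "uuid", # str, uuid4
    "from_acc": 1, # int, account id between 1 and n_acc
    "to_acc": 2, # int, account id between 1 and n_acc
    "balance": 50, # int, between 1 and 100
    "created_time": "2000-05-17T08:30:00+00:00", # str, ISO 8601 datetime in UTC
}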
```

In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import uuid
import json
import random
from datetime import datetime, timedelta, timezone
from s3pathlib import S3Path
from rich import print as rprint


In [7]:
class Config:
    """
    Settings for the dummy bank-transaction data set used in this notebook.

    The ``source`` location is where the generated JSON files are written;
    the ``target`` location is exposed via :attr:`s3path_target` but is not
    written to by the cells shown here.
    """
    # presumably the AWS region of the buckets below -- not referenced in the visible cells
    region = "us-east-2"
    source_bucket = "aws-data-lab-sanhe-for-everything-us-east-2"
    source_prefix = "poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/status-tracking/bank_transaction/source/"
    target_bucket = "aws-data-lab-sanhe-for-everything-us-east-2"
    target_prefix = "poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/status-tracking/bank_transaction/target/"

    n_file = 100 # total number of files to create
    n_rows_per_file = 1000 # number of rows in each file
    n_acc = 20000 # number of simulated bank accounts transferring money to each other
    failed_rate = 10 # percentage of files created as "bad" files, e.g. 5 means 5%

    @property
    def s3path_source(self) -> S3Path:
        """S3 folder that holds the generated source files."""
        bucket, prefix = self.source_bucket, self.source_prefix
        return S3Path(bucket, prefix)

    @property
    def s3path_target(self) -> S3Path:
        """S3 folder built from the target bucket and prefix."""
        bucket, prefix = self.target_bucket, self.target_prefix
        return S3Path(bucket, prefix)


config = Config()

In [4]:
def create_one(ith: int, verbose: bool = True) -> None:
    """
    Generate one file of random bank-transaction records and write it to S3.

    The file is written under ``config.s3path_source`` as ``<ith>.json`` with
    ``ith`` zero-padded to six digits. It holds ``config.n_rows_per_file``
    JSON objects, one per line. With a probability of ``config.failed_rate``
    percent the file is made "bad": between 1 and 5 distinct rows are
    replaced by a record that is not a valid transaction.

    :param ith: sequence number of the file, used to build the file name.
    :param verbose: when True, print progress and the S3 console URL.
    """
    if verbose:
        print(f"processing {ith} th file ...")
    bank_epoch = datetime(2000, 1, 1, tzinfo=timezone.utc)
    s3path = S3Path(config.s3path_source, "{}.json".format(str(ith).zfill(6)))
    data = [
        dict(
            trans_id=str(uuid.uuid4()),
            from_acc=random.randint(1, config.n_acc),
            to_acc=random.randint(1, config.n_acc),
            balance=random.randint(1, 100),
            # 31_536_000 seconds = 365 days after the epoch at most
            created_time=(
                bank_epoch + timedelta(seconds=random.randint(1, 31_536_000))
            ).isoformat(),
        )
        for _ in range(config.n_rows_per_file)
    ]
    # Skip corruption for an empty file: there is no row to replace.
    if data and random.randint(1, 100) <= config.failed_rate:
        # Randomly replace 1 ~ 5 rows with bad records. Sampling without
        # replacement guarantees the drawn number of *distinct* rows is
        # corrupted; the count is capped for files shorter than 5 rows.
        n_bad = min(random.randint(1, 5), len(data))
        for row_id in random.sample(range(len(data)), n_bad):
            data[row_id] = dict(id=1, name="alice") # this is not a valid transaction data
    s3path.write_text("\n".join(json.dumps(row) for row in data))
    if verbose:
        print(f"  open for preview: {s3path.console_url}")

create_one(1, verbose=True)

processing 1 th file ...
  open for preview: https://console.aws.amazon.com/s3/object/aws-data-lab-sanhe-for-everything-us-east-2?prefix=poc/learn-big-data-on-aws/glue-job-examples/04-glue-job-best-practice/status-tracking/bank_transaction/source/000001.json


In [5]:
# Produce the full batch of dummy source files, numbered 1 through config.n_file
for ith in range(1, config.n_file + 1):
    create_one(ith, verbose=False)
print("done")

done


In [10]:
# Check how many rows are created
import pandas as pd

# Read every object under the source prefix as line-delimited JSON and
# record how many rows each one holds.
row_counts = []
for s3path in config.s3path_source.iter_objects():
    with s3path.open("r") as f:
        df = pd.read_json(f, orient="records", lines=True)
    row_counts.append(len(df))
total_rows = sum(row_counts)

print(f"total number of rows = {total_rows}")

total number of rows = 100000
