In [1]:
# Parameters
RUN_DATE = "2025-10-08"


<a href="https://colab.research.google.com/github/HieuNguyenPhi/ADJ_JOBS/blob/main/notebooks/ADJUST_JOB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
from azure.storage.blob import BlobServiceClient

account_name = os.getenv('ACCOUNT_NAME')
account_key = os.getenv('ACCOUNT_KEY')
# Replace with your Azure Storage account name and SAS token or connection string
connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_list = blob_service_client.list_containers()
container_name = "adjuststbuatprocessed" #os.getenv('CONTAINER_NAME')
container_client = blob_service_client.get_container_client(container_name)
already_processed = [file.name.split('/')[1].split('.')[0] for file in container_client.list_blobs() if file.name.split('/')[0] == 'output']
already_processed[-5:]

['2025-10-03', '2025-10-04', '2025-10-05', '2025-10-06', '2025-10-07']

In [3]:
from datetime import date
import pandas as pd
today = date.today().strftime('%Y-%m-%d')
need_process = pd.date_range(start=already_processed[-2], end=today).strftime('%Y-%m-%d').to_list()
need_process

['2025-10-06', '2025-10-07', '2025-10-08']

In [4]:
container_name_uat = "adjuststbuat"
container_client_uat = blob_service_client.get_container_client(container_name_uat)
from collections import defaultdict
files = [i.name for i in container_client_uat.list_blobs()]
groups = defaultdict(list)
for f in files:
    dt = f.split('_')[1]
    groups[dt].append(f)
groups[dt]

['rsh20bkkb4zk_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_2853a0.csv.gz',
 'rsh20bkkb4zk_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_2853a1.csv.gz',
 'rsh20bkkb4zk_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_c35750.csv.gz',
 'rsh20bkkb4zk_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_c35751.csv.gz']

In [5]:
from pathlib import Path
root = Path.cwd()
process_path = f'{root}/process/adjust_uat'
from tqdm import tqdm
import polars as pl

os.makedirs(process_path, exist_ok=True)

storage_options = {
    "account_name": account_name,
    "account_key":  account_key,
}

for ts, files in tqdm(groups.items()):
    dt = ts[:10]                       # "2025-06-25" -> partition dt=...
    if dt not in need_process:
        continue
    # break
    partition_dir = os.path.join(process_path, f"dt={dt}")
    os.makedirs(partition_dir, exist_ok=True)
    out_file = os.path.join(partition_dir, f"{ts}.parquet")

    dfs = []
    for f in files:
        df = (pl.scan_csv(f"az://adjuststbuat/{f}",                             # eager
                          storage_options=storage_options,
                          has_header=True,
                          null_values=["", "NULL"])       # rỗng → null
                .select(pl.all().cast(pl.Utf8)))          # tất cả cột → string
        dfs.append(df)

    df_all = pl.concat(dfs, how="diagonal")               # tự thêm null cột thiếu
    df_all.sink_parquet(out_file, compression="snappy")
    print(f'Done dt={dt}/{ts}.parquet')


  0%|          | 0/1834 [00:00<?, ?it/s]

 97%|█████████▋| 1786/1834 [00:01<00:00, 1646.15it/s]

Done dt=2025-10-06/2025-10-06T000000.parquet


Done dt=2025-10-06/2025-10-06T010000.parquet


Done dt=2025-10-06/2025-10-06T020000.parquet


Done dt=2025-10-06/2025-10-06T030000.parquet


Done dt=2025-10-06/2025-10-06T040000.parquet


Done dt=2025-10-06/2025-10-06T050000.parquet


Done dt=2025-10-06/2025-10-06T060000.parquet


 97%|█████████▋| 1786/1834 [00:19<00:00, 1646.15it/s]

 98%|█████████▊| 1793/1834 [00:22<00:00, 56.69it/s]  

Done dt=2025-10-06/2025-10-06T070000.parquet


 98%|█████████▊| 1794/1834 [00:25<00:00, 47.25it/s]

Done dt=2025-10-06/2025-10-06T080000.parquet


Done dt=2025-10-06/2025-10-06T090000.parquet


Done dt=2025-10-06/2025-10-06T100000.parquet


Done dt=2025-10-06/2025-10-06T110000.parquet


Done dt=2025-10-06/2025-10-06T120000.parquet


Done dt=2025-10-06/2025-10-06T130000.parquet


 98%|█████████▊| 1794/1834 [00:40<00:00, 47.25it/s]

 98%|█████████▊| 1800/1834 [00:41<00:01, 21.26it/s]

Done dt=2025-10-06/2025-10-06T140000.parquet


 98%|█████████▊| 1801/1834 [00:43<00:01, 19.32it/s]

Done dt=2025-10-06/2025-10-06T150000.parquet


Done dt=2025-10-06/2025-10-06T160000.parquet


Done dt=2025-10-06/2025-10-06T170000.parquet


Done dt=2025-10-06/2025-10-06T180000.parquet


Done dt=2025-10-06/2025-10-06T190000.parquet


Done dt=2025-10-06/2025-10-06T200000.parquet


Done dt=2025-10-06/2025-10-06T210000.parquet


Done dt=2025-10-06/2025-10-06T220000.parquet


 98%|█████████▊| 1801/1834 [01:00<00:01, 19.32it/s]

 99%|█████████▊| 1809/1834 [01:00<00:02,  9.64it/s]

Done dt=2025-10-06/2025-10-06T230000.parquet


 99%|█████████▊| 1810/1834 [01:02<00:02,  8.87it/s]

Done dt=2025-10-07/2025-10-07T000000.parquet


Done dt=2025-10-07/2025-10-07T010000.parquet


Done dt=2025-10-07/2025-10-07T020000.parquet


Done dt=2025-10-07/2025-10-07T030000.parquet


Done dt=2025-10-07/2025-10-07T040000.parquet


Done dt=2025-10-07/2025-10-07T050000.parquet


 99%|█████████▊| 1810/1834 [01:20<00:02,  8.87it/s]

 99%|█████████▉| 1816/1834 [01:20<00:03,  4.56it/s]

Done dt=2025-10-07/2025-10-07T060000.parquet


 99%|█████████▉| 1817/1834 [01:23<00:04,  4.07it/s]

Done dt=2025-10-07/2025-10-07T070000.parquet


Done dt=2025-10-07/2025-10-07T080000.parquet


Done dt=2025-10-07/2025-10-07T090000.parquet


Done dt=2025-10-07/2025-10-07T100000.parquet


Done dt=2025-10-07/2025-10-07T110000.parquet


 99%|█████████▉| 1817/1834 [01:40<00:04,  4.07it/s]

 99%|█████████▉| 1822/1834 [01:40<00:05,  2.34it/s]

Done dt=2025-10-07/2025-10-07T120000.parquet


 99%|█████████▉| 1823/1834 [01:43<00:05,  2.11it/s]

Done dt=2025-10-07/2025-10-07T130000.parquet


Done dt=2025-10-07/2025-10-07T140000.parquet


Done dt=2025-10-07/2025-10-07T150000.parquet


Done dt=2025-10-07/2025-10-07T160000.parquet


Done dt=2025-10-07/2025-10-07T170000.parquet


Done dt=2025-10-07/2025-10-07T180000.parquet


Done dt=2025-10-07/2025-10-07T190000.parquet


 99%|█████████▉| 1823/1834 [02:00<00:05,  2.11it/s]

100%|█████████▉| 1830/1834 [02:00<00:02,  1.34it/s]

Done dt=2025-10-07/2025-10-07T200000.parquet


100%|█████████▉| 1831/1834 [02:02<00:02,  1.28it/s]

Done dt=2025-10-07/2025-10-07T210000.parquet


Done dt=2025-10-07/2025-10-07T220000.parquet


Done dt=2025-10-07/2025-10-07T230000.parquet


100%|██████████| 1834/1834 [02:08<00:00, 14.26it/s]

Done dt=2025-10-08/2025-10-08T000000.parquet





In [6]:
output_path = f'{root}/output/adjust_uat'
os.makedirs(output_path, exist_ok=True)
import glob
import shutil
for dt_folder in tqdm(glob.glob(os.path.join(process_path, "dt=*"))):
    dt = os.path.basename(dt_folder)[3:]                 # "2025-06-25"
    files_pq = glob.glob(os.path.join(dt_folder, "*T*.parquet"))
    if not files_pq:
        continue

    out_path = os.path.join(output_path, f"{dt}.parquet")

    # Nếu trước đó lỡ tạo cùng tên dưới dạng DIR → xoá
    if os.path.isdir(out_path):
        shutil.rmtree(out_path)

    # ---------- ❶  Lazy scan tất cả Parquet ----------
    lfs = [pl.scan_parquet(f) for f in files_pq]          # mỗi file → LazyFrame

    # ---------- ❷  Concat diagonal + giữ schema linh hoạt ----------
    lf_day = (
        pl.concat(lfs, how="diagonal")                    # tự thêm null cột thiếu
        .select(pl.all().cast(pl.Utf8))                   # đảm bảo mọi cột = string
        .with_columns(pl.lit(dt).alias("dt"))             # thêm cột partition (tuỳ)
    )

    # ---------- ❸  Ghi duy nhất 1 Parquet ----------
    lf_day.sink_parquet(out_path, compression="snappy")

  0%|          | 0/3 [00:00<?, ?it/s]

 67%|██████▋   | 2/3 [00:00<00:00, 18.46it/s]

100%|██████████| 3/3 [00:00<00:00, 14.68it/s]




In [7]:

local_folder_path = f"{root}/process/adjust_uat"

# Replace with the desired folder name in the Azure container
azure_folder_name = "processing"

# Iterate through files in the local folder and upload them
for root, dirs, files in tqdm(os.walk(local_folder_path)):
    for file in files:
        # Construct the full local file path
        local_file_path = os.path.join(root, file)

        # Construct the blob name (path within the Azure container)
        # This preserves the folder structure from the local path
        relative_path = os.path.relpath(local_file_path, local_folder_path)
        blob_name = os.path.join(azure_folder_name, relative_path)
        # print(blob_name)
        # Create a blob client for the current file
        blob_client = container_client.get_blob_client(blob_name)

        print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

#         # Upload the file
        with open(local_file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

print("\nFolder upload complete.")

0it [00:00, ?it/s]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T090000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T090000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T200000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T200000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T110000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T110000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T190000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T190000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T100000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T100000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T060000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T060000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T120000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T120000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T230000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T230000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T170000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T170000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T070000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T070000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T050000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T050000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T210000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T210000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T160000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T160000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T000000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T000000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T010000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T010000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T080000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T080000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T040000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T040000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T020000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T020000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T180000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T180000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T030000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T030000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T130000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T130000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T220000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T220000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T150000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T150000.parquet


2it [00:04,  2.45s/it]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T140000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T140000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-08/2025-10-08T000000.parquet to adjuststbuatprocessed/processing/dt=2025-10-08/2025-10-08T000000.parquet


3it [00:05,  1.51s/it]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T230000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T230000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T060000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T060000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T170000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T170000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T050000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T050000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T090000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T090000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T040000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T040000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T190000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T190000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T020000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T020000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T200000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T200000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T130000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T130000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T220000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T220000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T070000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T070000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T110000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T110000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T180000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T180000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T150000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T150000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T100000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T100000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T010000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T010000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T030000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T030000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T000000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T000000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T080000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T080000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T160000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T160000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T120000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T120000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T140000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T140000.parquet
Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T210000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T210000.parquet


4it [00:09,  2.57s/it]

4it [00:09,  2.36s/it]


Folder upload complete.





In [8]:

local_folder_path = f"{root}/output/adjust_uat"

# Replace with the desired folder name in the Azure container
azure_folder_name = "output"

# Iterate through files in the local folder and upload them
for root, dirs, files in tqdm(os.walk(local_folder_path)):
    for file in files:
        # Construct the full local file path
        local_file_path = os.path.join(root, file)

        # Construct the blob name (path within the Azure container)
        # This preserves the folder structure from the local path
        relative_path = os.path.relpath(local_file_path, local_folder_path)
        blob_name = os.path.join(azure_folder_name, relative_path)
        # print(blob_name)
        # Create a blob client for the current file
        blob_client = container_client.get_blob_client(blob_name)

        print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

#         # Upload the file
        with open(local_file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

print("\nFolder upload complete.")

0it [00:00, ?it/s]

0it [00:00, ?it/s]


Folder upload complete.





In [9]:
# Replace with the path to the local folder you want to delete
local_folder_paths = [f"{root}/process/adjust_uat",f"{root}/output/adjust_uat"]
# local_folder_path = f"data/process/adjust_live"
for local_folder_path in local_folder_paths:
    if os.path.exists(local_folder_path):
        print(f"Deleting local folder: {local_folder_path}")
        shutil.rmtree(local_folder_path)
        print("Local folder deleted.")
    else:
        print(f"Local folder not found: {local_folder_path}")

Local folder not found: /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_uat
Local folder not found: /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/output/adjust_uat


# Live

In [10]:
already_processed = [file.name.split('/')[-1].split('.')[0] for file in container_client.list_blobs() if file.name[:12] == 'live/output/']
already_processed[-5:]

['2025-10-03', '2025-10-04', '2025-10-05', '2025-10-06', '2025-10-07']

In [11]:
need_process = pd.date_range(start=already_processed[-1], end=today).strftime('%Y-%m-%d').to_list()
need_process

['2025-10-07', '2025-10-08']

In [12]:
container_name_uat = "adjuststblive"
container_client_uat = blob_service_client.get_container_client(container_name_uat)
from collections import defaultdict
files = [i.name for i in container_client_uat.list_blobs()]
groups = defaultdict(list)
for f in files:
    dt = f.split('_')[1]
    groups[dt].append(f)
groups[dt]

['65n1fgov4zr4_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_2853a0.csv.gz',
 '65n1fgov4zr4_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_2853a1.csv.gz',
 '65n1fgov4zr4_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_be8220.csv.gz',
 '65n1fgov4zr4_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_be8221.csv.gz',
 '65n1fgov4zr4_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_c35750.csv.gz',
 '65n1fgov4zr4_2025-10-08T000000_762c775ae454d23f2c6b6a75623d14c7_c35751.csv.gz']

In [13]:
process_path = f'{root}/process/adjust_live'
os.makedirs(process_path, exist_ok=True)

for ts, files in tqdm(groups.items()):
    # if ts not in need_process:
    #     continue
    dt = ts[:10]                       # "2025-06-25" -> partition dt=...
    # print(ts)
    # break
    if dt not in need_process:
        continue
    # break
    partition_dir = os.path.join(process_path, f"dt={dt}")
    os.makedirs(partition_dir, exist_ok=True)
    out_file = os.path.join(partition_dir, f"{ts}.parquet")

    dfs = []
    for f in files:
        df = (pl.scan_csv(f"az://adjuststblive/{f}",                             # eager
                          storage_options=storage_options,
                          has_header=True,
                          null_values=["", "NULL"],
                          ignore_errors=True)       # rỗng → null
                .select(pl.all().cast(pl.Utf8)))          # tất cả cột → string
        dfs.append(df)

    df_all = pl.concat(dfs, how="diagonal")               # tự thêm null cột thiếu
    df_all.sink_parquet(out_file, compression="snappy")
    print(f'Done dt={dt}/{ts}.parquet')

  0%|          | 0/1815 [00:00<?, ?it/s]

 99%|█████████▊| 1791/1815 [00:14<00:00, 122.76it/s]

Done dt=2025-10-07/2025-10-07T000000.parquet


Done dt=2025-10-07/2025-10-07T010000.parquet


 99%|█████████▊| 1791/1815 [00:28<00:00, 122.76it/s]

 99%|█████████▉| 1793/1815 [00:38<00:00, 37.15it/s] 

Done dt=2025-10-07/2025-10-07T020000.parquet


 99%|█████████▉| 1794/1815 [00:51<00:00, 24.11it/s]

Done dt=2025-10-07/2025-10-07T030000.parquet


 99%|█████████▉| 1795/1815 [01:04<00:01, 16.00it/s]

Done dt=2025-10-07/2025-10-07T040000.parquet


 99%|█████████▉| 1796/1815 [01:16<00:01, 11.07it/s]

Done dt=2025-10-07/2025-10-07T050000.parquet


 99%|█████████▉| 1797/1815 [01:26<00:02,  7.97it/s]

Done dt=2025-10-07/2025-10-07T060000.parquet


 99%|█████████▉| 1798/1815 [01:38<00:03,  5.52it/s]

Done dt=2025-10-07/2025-10-07T070000.parquet


 99%|█████████▉| 1799/1815 [01:52<00:04,  3.68it/s]

Done dt=2025-10-07/2025-10-07T080000.parquet


 99%|█████████▉| 1800/1815 [02:06<00:05,  2.50it/s]

Done dt=2025-10-07/2025-10-07T090000.parquet


 99%|█████████▉| 1801/1815 [02:21<00:08,  1.70it/s]

Done dt=2025-10-07/2025-10-07T100000.parquet


 99%|█████████▉| 1802/1815 [02:35<00:10,  1.19it/s]

Done dt=2025-10-07/2025-10-07T110000.parquet


 99%|█████████▉| 1803/1815 [02:49<00:14,  1.17s/it]

Done dt=2025-10-07/2025-10-07T120000.parquet


 99%|█████████▉| 1804/1815 [03:01<00:17,  1.57s/it]

Done dt=2025-10-07/2025-10-07T130000.parquet


 99%|█████████▉| 1805/1815 [03:10<00:19,  1.95s/it]

Done dt=2025-10-07/2025-10-07T140000.parquet


100%|█████████▉| 1806/1815 [03:20<00:22,  2.47s/it]

Done dt=2025-10-07/2025-10-07T150000.parquet


100%|█████████▉| 1807/1815 [03:26<00:21,  2.73s/it]

Done dt=2025-10-07/2025-10-07T160000.parquet


100%|█████████▉| 1808/1815 [03:30<00:20,  2.88s/it]

Done dt=2025-10-07/2025-10-07T170000.parquet


100%|█████████▉| 1809/1815 [03:34<00:18,  3.01s/it]

Done dt=2025-10-07/2025-10-07T180000.parquet


100%|█████████▉| 1810/1815 [03:38<00:15,  3.11s/it]

Done dt=2025-10-07/2025-10-07T190000.parquet


100%|█████████▉| 1811/1815 [03:41<00:12,  3.21s/it]

Done dt=2025-10-07/2025-10-07T200000.parquet


100%|█████████▉| 1812/1815 [03:45<00:09,  3.33s/it]

Done dt=2025-10-07/2025-10-07T210000.parquet


100%|█████████▉| 1813/1815 [03:49<00:07,  3.56s/it]

Done dt=2025-10-07/2025-10-07T220000.parquet


100%|█████████▉| 1814/1815 [03:58<00:04,  4.81s/it]

Done dt=2025-10-07/2025-10-07T230000.parquet


100%|██████████| 1815/1815 [04:08<00:00,  6.12s/it]

100%|██████████| 1815/1815 [04:08<00:00,  7.31it/s]

Done dt=2025-10-08/2025-10-08T000000.parquet





In [14]:
output_path = f'{root}/output/adjust_live'
os.makedirs(output_path, exist_ok=True)
for dt_folder in tqdm(glob.glob(os.path.join(process_path, "dt=*"))):
    dt = os.path.basename(dt_folder)[3:]                 # "2025-06-25"
    files_pq = glob.glob(os.path.join(dt_folder, "*T*.parquet"))
    if not files_pq:
        continue

    out_path = os.path.join(output_path, f"{dt}.parquet")

    # Nếu trước đó lỡ tạo cùng tên dưới dạng DIR → xoá
    if os.path.isdir(out_path):
        shutil.rmtree(out_path)

    # ---------- ❶  Lazy scan tất cả Parquet ----------
    lfs = [pl.scan_parquet(f) for f in files_pq]          # mỗi file → LazyFrame

    # ---------- ❷  Concat diagonal + giữ schema linh hoạt ----------
    lf_day = (
        pl.concat(lfs, how="diagonal")                    # tự thêm null cột thiếu
        .select(pl.all().cast(pl.Utf8))                   # đảm bảo mọi cột = string
        .with_columns(pl.lit(dt).alias("dt"))             # thêm cột partition (tuỳ)
    )

    # ---------- ❸  Ghi duy nhất 1 Parquet ----------
    lf_day.sink_parquet(out_path, compression="snappy")

  0%|          | 0/2 [00:00<?, ?it/s]

 50%|█████     | 1/2 [00:01<00:01,  1.30s/it]

100%|██████████| 2/2 [00:31<00:00, 18.53s/it]

100%|██████████| 2/2 [00:31<00:00, 15.95s/it]




In [15]:

local_folder_path = f"{root}/process/adjust_live"

# Replace with the desired folder name in the Azure container
azure_folder_name = "live/processing"

# Iterate through files in the local folder and upload them
for root, dirs, files in tqdm(os.walk(local_folder_path)):
    for file in files:
        # Construct the full local file path
        local_file_path = os.path.join(root, file)

        # Construct the blob name (path within the Azure container)
        # This preserves the folder structure from the local path
        relative_path = os.path.relpath(local_file_path, local_folder_path)
        blob_name = os.path.join(azure_folder_name, relative_path)
        # print(blob_name)
        # Create a blob client for the current file
        blob_client = container_client.get_blob_client(blob_name)

        print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

#         # Upload the file
        with open(local_file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

print("\nFolder upload complete.")

0it [00:00, ?it/s]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-08/2025-10-08T000000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-08/2025-10-08T000000.parquet


2it [00:10,  5.03s/it]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T230000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T230000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T060000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T060000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T170000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T170000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T050000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T050000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T090000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T090000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T040000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T040000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T190000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T190000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T020000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T020000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T200000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T200000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T130000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T130000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T220000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T220000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T070000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T070000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T110000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T110000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T180000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T180000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T150000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T150000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T100000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T100000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T010000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T010000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T030000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T030000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T000000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T000000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T080000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T080000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T160000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T160000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T120000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T120000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T140000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T140000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/2025-10-07T210000.parquet to adjuststbuatprocessed/live/processing/dt=2025-10-07/2025-10-07T210000.parquet


3it [12:44, 317.22s/it]

3it [12:44, 254.78s/it]


Folder upload complete.





In [16]:
local_folder_path = f"{root}/output/adjust_live"

# Replace with the desired folder name in the Azure container
azure_folder_name = "live/output"

# Iterate through files in the local folder and upload them
for root, dirs, files in tqdm(os.walk(local_folder_path)):
    for file in files:
        # Construct the full local file path
        local_file_path = os.path.join(root, file)

        # Construct the blob name (path within the Azure container)
        # This preserves the folder structure from the local path
        relative_path = os.path.relpath(local_file_path, local_folder_path)
        blob_name = os.path.join(azure_folder_name, relative_path)
        # print(blob_name)
        # Create a blob client for the current file
        blob_client = container_client.get_blob_client(blob_name)

        print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

#         # Upload the file
        with open(local_file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

print("\nFolder upload complete.")

0it [00:00, ?it/s]

0it [00:00, ?it/s]


Folder upload complete.





In [17]:
# Replace with the path to the local folder you want to delete
local_folder_paths = [f"{root}/process/adjust_live",f"{root}/output/adjust_live"]
# local_folder_path = f"data/process/adjust_live"
for local_folder_path in local_folder_paths:
    if os.path.exists(local_folder_path):
        print(f"Deleting local folder: {local_folder_path}")
        shutil.rmtree(local_folder_path)
        print("Local folder deleted.")
    else:
        print(f"Local folder not found: {local_folder_path}")

Local folder not found: /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/process/adjust_live
Local folder not found: /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_live/dt=2025-10-07/output/adjust_live
