In [1]:
# Parameters
RUN_DATE = "2025-10-08"


<a href="https://colab.research.google.com/github/HieuNguyenPhi/ADJ_JOBS/blob/main/notebooks/ADJUST_JOB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
from azure.storage.blob import BlobServiceClient

account_name = os.getenv('ACCOUNT_NAME')
account_key = os.getenv('ACCOUNT_KEY')
# Replace with your Azure Storage account name and SAS token or connection string
connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_list = blob_service_client.list_containers()
container_name = "adjuststbuatprocessed" #os.getenv('CONTAINER_NAME')
container_client = blob_service_client.get_container_client(container_name)
already_processed = [file.name.split('/')[1].split('.')[0] for file in container_client.list_blobs() if file.name.split('/')[0] == 'output']
already_processed[-5:]

['2025-10-03', '2025-10-04', '2025-10-05', '2025-10-06', '2025-10-07']

In [3]:
from datetime import date
import pandas as pd
today = date.today().strftime('%Y-%m-%d')
need_process = pd.date_range(start=already_processed[-2], end=today).strftime('%Y-%m-%d').to_list()
need_process

['2025-10-06', '2025-10-07', '2025-10-08']

In [4]:
container_name_uat = "adjuststbuat"
container_client_uat = blob_service_client.get_container_client(container_name_uat)
from collections import defaultdict
files = [i.name for i in container_client_uat.list_blobs()]
groups = defaultdict(list)
for f in files:
    dt = f.split('_')[1]
    groups[dt].append(f)
groups[dt]

['rsh20bkkb4zk_2025-10-08T030000_762c775ae454d23f2c6b6a75623d14c7_2853a0.csv.gz',
 'rsh20bkkb4zk_2025-10-08T030000_762c775ae454d23f2c6b6a75623d14c7_2853a1.csv.gz',
 'rsh20bkkb4zk_2025-10-08T030000_762c775ae454d23f2c6b6a75623d14c7_be8220.csv.gz',
 'rsh20bkkb4zk_2025-10-08T030000_762c775ae454d23f2c6b6a75623d14c7_be8221.csv.gz',
 'rsh20bkkb4zk_2025-10-08T030000_762c775ae454d23f2c6b6a75623d14c7_c35750.csv.gz',
 'rsh20bkkb4zk_2025-10-08T030000_762c775ae454d23f2c6b6a75623d14c7_c35751.csv.gz']

In [5]:
from pathlib import Path
root = Path.cwd()
process_path = f'{root}/process/adjust_uat'
from tqdm import tqdm
import polars as pl

os.makedirs(process_path, exist_ok=True)

storage_options = {
    "account_name": account_name,
    "account_key":  account_key,
}

for ts, files in tqdm(groups.items()):
    dt = ts[:10]                       # "2025-06-25" -> partition dt=...
    if dt not in need_process:
        continue
    # break
    partition_dir = os.path.join(process_path, f"dt={dt}")
    os.makedirs(partition_dir, exist_ok=True)
    out_file = os.path.join(partition_dir, f"{ts}.parquet")

    dfs = []
    for f in files:
        df = (pl.scan_csv(f"az://adjuststbuat/{f}",                             # eager
                          storage_options=storage_options,
                          has_header=True,
                          null_values=["", "NULL"])       # rỗng → null
                .select(pl.all().cast(pl.Utf8)))          # tất cả cột → string
        dfs.append(df)

    df_all = pl.concat(dfs, how="diagonal")               # tự thêm null cột thiếu
    df_all.sink_parquet(out_file, compression="snappy")
    print(f'Done dt={dt}/{ts}.parquet')


  0%|          | 0/1837 [00:00<?, ?it/s]

 97%|█████████▋| 1786/1837 [00:01<00:00, 1225.52it/s]

Done dt=2025-10-06/2025-10-06T000000.parquet


Done dt=2025-10-06/2025-10-06T010000.parquet


Done dt=2025-10-06/2025-10-06T020000.parquet


Done dt=2025-10-06/2025-10-06T030000.parquet


Done dt=2025-10-06/2025-10-06T040000.parquet


 97%|█████████▋| 1786/1837 [00:19<00:00, 1225.52it/s]

 97%|█████████▋| 1791/1837 [00:22<00:00, 56.76it/s]  

Done dt=2025-10-06/2025-10-06T050000.parquet


 98%|█████████▊| 1792/1837 [00:26<00:01, 44.40it/s]

Done dt=2025-10-06/2025-10-06T060000.parquet


Done dt=2025-10-06/2025-10-06T070000.parquet


Done dt=2025-10-06/2025-10-06T080000.parquet


 98%|█████████▊| 1792/1837 [00:40<00:01, 44.40it/s]

Done dt=2025-10-06/2025-10-06T090000.parquet


 98%|█████████▊| 1796/1837 [00:44<00:02, 19.87it/s]

Done dt=2025-10-06/2025-10-06T100000.parquet


 98%|█████████▊| 1797/1837 [00:47<00:02, 17.09it/s]

Done dt=2025-10-06/2025-10-06T110000.parquet


Done dt=2025-10-06/2025-10-06T120000.parquet


Done dt=2025-10-06/2025-10-06T130000.parquet


Done dt=2025-10-06/2025-10-06T140000.parquet


 98%|█████████▊| 1797/1837 [01:00<00:02, 17.09it/s]

 98%|█████████▊| 1801/1837 [01:00<00:03, 10.01it/s]

Done dt=2025-10-06/2025-10-06T150000.parquet


 98%|█████████▊| 1802/1837 [01:03<00:03,  8.87it/s]

Done dt=2025-10-06/2025-10-06T160000.parquet


Done dt=2025-10-06/2025-10-06T170000.parquet


Done dt=2025-10-06/2025-10-06T180000.parquet


Done dt=2025-10-06/2025-10-06T190000.parquet


Done dt=2025-10-06/2025-10-06T200000.parquet


Done dt=2025-10-06/2025-10-06T210000.parquet


 98%|█████████▊| 1802/1837 [01:20<00:03,  8.87it/s]

 98%|█████████▊| 1808/1837 [01:20<00:06,  4.59it/s]

Done dt=2025-10-06/2025-10-06T220000.parquet


 98%|█████████▊| 1809/1837 [01:23<00:06,  4.14it/s]

Done dt=2025-10-06/2025-10-06T230000.parquet


Done dt=2025-10-07/2025-10-07T000000.parquet


Done dt=2025-10-07/2025-10-07T010000.parquet


Done dt=2025-10-07/2025-10-07T020000.parquet


Done dt=2025-10-07/2025-10-07T030000.parquet


 98%|█████████▊| 1809/1837 [01:40<00:06,  4.14it/s]

 99%|█████████▊| 1814/1837 [01:44<00:10,  2.13it/s]

Done dt=2025-10-07/2025-10-07T040000.parquet


 99%|█████████▉| 1815/1837 [01:46<00:11,  1.96it/s]

Done dt=2025-10-07/2025-10-07T050000.parquet


Done dt=2025-10-07/2025-10-07T060000.parquet


Done dt=2025-10-07/2025-10-07T070000.parquet


 99%|█████████▉| 1815/1837 [02:00<00:11,  1.96it/s]

Done dt=2025-10-07/2025-10-07T080000.parquet


 99%|█████████▉| 1819/1837 [02:04<00:15,  1.18it/s]

Done dt=2025-10-07/2025-10-07T090000.parquet


 99%|█████████▉| 1820/1837 [02:08<00:16,  1.05it/s]

Done dt=2025-10-07/2025-10-07T100000.parquet


Done dt=2025-10-07/2025-10-07T110000.parquet


Done dt=2025-10-07/2025-10-07T120000.parquet


 99%|█████████▉| 1820/1837 [02:20<00:16,  1.05it/s]

 99%|█████████▉| 1823/1837 [02:22<00:19,  1.37s/it]

Done dt=2025-10-07/2025-10-07T130000.parquet


 99%|█████████▉| 1824/1837 [02:26<00:19,  1.52s/it]

Done dt=2025-10-07/2025-10-07T140000.parquet


Done dt=2025-10-07/2025-10-07T150000.parquet


Done dt=2025-10-07/2025-10-07T160000.parquet


Done dt=2025-10-07/2025-10-07T170000.parquet


100%|█████████▉| 1828/1837 [02:38<00:16,  1.85s/it]

Done dt=2025-10-07/2025-10-07T180000.parquet


100%|█████████▉| 1829/1837 [02:41<00:15,  1.93s/it]

Done dt=2025-10-07/2025-10-07T190000.parquet


Done dt=2025-10-07/2025-10-07T200000.parquet


Done dt=2025-10-07/2025-10-07T210000.parquet


100%|█████████▉| 1832/1837 [02:50<00:10,  2.19s/it]

Done dt=2025-10-07/2025-10-07T220000.parquet


Done dt=2025-10-07/2025-10-07T230000.parquet


100%|█████████▉| 1834/1837 [02:56<00:06,  2.33s/it]

Done dt=2025-10-08/2025-10-08T000000.parquet


Done dt=2025-10-08/2025-10-08T010000.parquet


100%|█████████▉| 1836/1837 [03:04<00:02,  2.76s/it]

Done dt=2025-10-08/2025-10-08T020000.parquet


100%|██████████| 1837/1837 [03:09<00:00,  2.97s/it]

100%|██████████| 1837/1837 [03:09<00:00,  9.71it/s]

Done dt=2025-10-08/2025-10-08T030000.parquet





In [6]:
local_folder_path = f"{root}/process/adjust_uat"

# Replace with the desired folder name in the Azure container
azure_folder_name = "processing"

# Iterate through files in the local folder and upload them
for root, dirs, files in tqdm(os.walk(local_folder_path)):
    for file in files:
        # Construct the full local file path
        local_file_path = os.path.join(root, file)

        # Construct the blob name (path within the Azure container)
        # This preserves the folder structure from the local path
        relative_path = os.path.relpath(local_file_path, local_folder_path)
        blob_name = os.path.join(azure_folder_name, relative_path)
        # print(blob_name)
        # Create a blob client for the current file
        blob_client = container_client.get_blob_client(blob_name)

        print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

#         # Upload the file
        with open(local_file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

print("\nFolder upload complete.")

0it [00:00, ?it/s]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T090000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T090000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T200000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T200000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T110000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T110000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T190000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T190000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T100000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T100000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T060000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T060000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T120000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T120000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T230000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T230000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T170000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T170000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T070000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T070000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T050000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T050000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T210000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T210000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T160000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T160000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T000000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T000000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T010000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T010000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T080000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T080000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T040000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T040000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T020000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T020000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T180000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T180000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T030000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T030000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T130000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T130000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T220000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T220000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T150000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T150000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-06/2025-10-06T140000.parquet to adjuststbuatprocessed/processing/dt=2025-10-06/2025-10-06T140000.parquet


2it [00:07,  3.75s/it]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-08/2025-10-08T030000.parquet to adjuststbuatprocessed/processing/dt=2025-10-08/2025-10-08T030000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-08/2025-10-08T020000.parquet to adjuststbuatprocessed/processing/dt=2025-10-08/2025-10-08T020000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-08/2025-10-08T000000.parquet to adjuststbuatprocessed/processing/dt=2025-10-08/2025-10-08T000000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-08/2025-10-08T010000.parquet to adjuststbuatprocessed/processing/dt=2025-10-08/2025-10-08T010000.parquet


3it [00:08,  2.61s/it]

Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T230000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T230000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T060000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T060000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T170000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T170000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T050000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T050000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T090000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T090000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T040000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T040000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T190000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T190000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T020000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T020000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T200000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T200000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T130000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T130000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T220000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T220000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T070000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T070000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T110000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T110000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T180000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T180000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T150000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T150000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T100000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T100000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T010000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T010000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T030000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T030000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T000000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T000000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T080000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T080000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T160000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T160000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T120000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T120000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T140000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T140000.parquet


Uploading /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/2025-10-07T210000.parquet to adjuststbuatprocessed/processing/dt=2025-10-07/2025-10-07T210000.parquet


4it [00:14,  3.96s/it]

4it [00:14,  3.69s/it]


Folder upload complete.





In [7]:
import glob
for dt_folder in tqdm(glob.glob(os.path.join(process_path, "dt=*"))):
  dt = os.path.basename(dt_folder)[3:]
  df = pl.scan_parquet(
      f"az://adjuststbuatprocessed/live/processing/dt={dt}/*.parquet",
      storage_options=storage_options,
      glob=True,                 # rất quan trọng để expand '*'
  ).select(pl.all().cast(pl.Utf8)).with_columns(pl.lit(dt).alias("dt"))
  df.sink_parquet(
      f"az://adjuststbuatprocessed/live/output/dt={dt}.parquet",
      storage_options=storage_options,
      compression="snappy",
  )
# output_path = f'{root}/output/adjust_uat'
# os.makedirs(output_path, exist_ok=True)
# import glob
# import shutil
# for dt_folder in tqdm(glob.glob(os.path.join(process_path, "dt=*"))):
#     dt = os.path.basename(dt_folder)[3:]                 # "2025-06-25"
#     files_pq = glob.glob(os.path.join(dt_folder, "*T*.parquet"))
#     if not files_pq:
#         continue

#     out_path = os.path.join(output_path, f"{dt}.parquet")

#     # Nếu trước đó lỡ tạo cùng tên dưới dạng DIR → xoá
#     if os.path.isdir(out_path):
#         shutil.rmtree(out_path)

#     # ---------- ❶  Lazy scan tất cả Parquet ----------
#     lfs = [pl.scan_parquet(f) for f in files_pq]          # mỗi file → LazyFrame

#     # ---------- ❷  Concat diagonal + giữ schema linh hoạt ----------
#     lf_day = (
#         pl.concat(lfs, how="diagonal")                    # tự thêm null cột thiếu
#         .select(pl.all().cast(pl.Utf8))                   # đảm bảo mọi cột = string
#         .with_columns(pl.lit(dt).alias("dt"))             # thêm cột partition (tuỳ)
#     )

#     # ---------- ❸  Ghi duy nhất 1 Parquet ----------
#     lf_day.sink_parquet(out_path, compression="snappy")

  0%|          | 0/3 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [01:21<02:42, 81.35s/it]

 67%|██████▋   | 2/3 [02:14<01:04, 64.83s/it]

100%|██████████| 3/3 [03:19<00:00, 65.02s/it]

100%|██████████| 3/3 [03:19<00:00, 66.62s/it]




In [8]:

# local_folder_path = f"{root}/output/adjust_uat"

# # Replace with the desired folder name in the Azure container
# azure_folder_name = "output"

# # Iterate through files in the local folder and upload them
# for root, dirs, files in tqdm(os.walk(local_folder_path)):
#     for file in files:
#         # Construct the full local file path
#         local_file_path = os.path.join(root, file)

#         # Construct the blob name (path within the Azure container)
#         # This preserves the folder structure from the local path
#         relative_path = os.path.relpath(local_file_path, local_folder_path)
#         blob_name = os.path.join(azure_folder_name, relative_path)
#         # print(blob_name)
#         # Create a blob client for the current file
#         blob_client = container_client.get_blob_client(blob_name)

#         print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

# #         # Upload the file
#         with open(local_file_path, "rb") as data:
#             blob_client.upload_blob(data, overwrite=True)

# print("\nFolder upload complete.")

In [9]:
# Replace with the path to the local folder you want to delete
local_folder_paths = [f"{root}/process/adjust_uat",f"{root}/output/adjust_uat"]
# local_folder_path = f"data/process/adjust_live"
for local_folder_path in local_folder_paths:
    if os.path.exists(local_folder_path):
        print(f"Deleting local folder: {local_folder_path}")
        shutil.rmtree(local_folder_path)
        print("Local folder deleted.")
    else:
        print(f"Local folder not found: {local_folder_path}")

Local folder not found: /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/process/adjust_uat
Local folder not found: /home/runner/work/ADJ_JOBS/ADJ_JOBS/process/adjust_uat/dt=2025-10-07/output/adjust_uat


# Live

In [10]:
# already_processed = [file.name.split('/')[-1].split('.')[0] for file in container_client.list_blobs() if file.name[:12] == 'live/output/']
# already_processed[-5:]

In [11]:
# need_process = pd.date_range(start=already_processed[-1], end=today).strftime('%Y-%m-%d').to_list()
# need_process

In [12]:
# container_name_uat = "adjuststblive"
# container_client_uat = blob_service_client.get_container_client(container_name_uat)
# from collections import defaultdict
# files = [i.name for i in container_client_uat.list_blobs()]
# groups = defaultdict(list)
# for f in files:
#     dt = f.split('_')[1]
#     groups[dt].append(f)
# groups[dt]

In [13]:
# process_path = f'{root}/process/adjust_live'
# os.makedirs(process_path, exist_ok=True)

# for ts, files in tqdm(groups.items()):
#     # if ts not in need_process:
#     #     continue
#     dt = ts[:10]                       # "2025-06-25" -> partition dt=...
#     # print(ts)
#     # break
#     if dt not in need_process:
#         continue
#     # break
#     partition_dir = os.path.join(process_path, f"dt={dt}")
#     os.makedirs(partition_dir, exist_ok=True)
#     out_file = os.path.join(partition_dir, f"{ts}.parquet")

#     dfs = []
#     for f in files:
#         df = (pl.scan_csv(f"az://adjuststblive/{f}",                             # eager
#                           storage_options=storage_options,
#                           has_header=True,
#                           null_values=["", "NULL"],
#                           ignore_errors=True)       # rỗng → null
#                 .select(pl.all().cast(pl.Utf8)))          # tất cả cột → string
#         dfs.append(df)

#     df_all = pl.concat(dfs, how="diagonal")               # tự thêm null cột thiếu
#     df_all.sink_parquet(out_file, compression="snappy")
#     print(f'Done dt={dt}/{ts}.parquet')

In [14]:
# output_path = f'{root}/output/adjust_live'
# os.makedirs(output_path, exist_ok=True)
# for dt_folder in tqdm(glob.glob(os.path.join(process_path, "dt=*"))):
#     dt = os.path.basename(dt_folder)[3:]                 # "2025-06-25"
#     files_pq = glob.glob(os.path.join(dt_folder, "*T*.parquet"))
#     if not files_pq:
#         continue

#     out_path = os.path.join(output_path, f"{dt}.parquet")

#     # Nếu trước đó lỡ tạo cùng tên dưới dạng DIR → xoá
#     if os.path.isdir(out_path):
#         shutil.rmtree(out_path)

#     # ---------- ❶  Lazy scan tất cả Parquet ----------
#     lfs = [pl.scan_parquet(f) for f in files_pq]          # mỗi file → LazyFrame

#     # ---------- ❷  Concat diagonal + giữ schema linh hoạt ----------
#     lf_day = (
#         pl.concat(lfs, how="diagonal")                    # tự thêm null cột thiếu
#         .select(pl.all().cast(pl.Utf8))                   # đảm bảo mọi cột = string
#         .with_columns(pl.lit(dt).alias("dt"))             # thêm cột partition (tuỳ)
#     )

#     # ---------- ❸  Ghi duy nhất 1 Parquet ----------
#     lf_day.sink_parquet(out_path, compression="snappy")

In [15]:

# local_folder_path = f"{root}/process/adjust_live"

# # Replace with the desired folder name in the Azure container
# azure_folder_name = "live/processing"

# # Iterate through files in the local folder and upload them
# for root, dirs, files in tqdm(os.walk(local_folder_path)):
#     for file in files:
#         # Construct the full local file path
#         local_file_path = os.path.join(root, file)

#         # Construct the blob name (path within the Azure container)
#         # This preserves the folder structure from the local path
#         relative_path = os.path.relpath(local_file_path, local_folder_path)
#         blob_name = os.path.join(azure_folder_name, relative_path)
#         # print(blob_name)
#         # Create a blob client for the current file
#         blob_client = container_client.get_blob_client(blob_name)

#         print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

# #         # Upload the file
#         with open(local_file_path, "rb") as data:
#             blob_client.upload_blob(data, overwrite=True)

# print("\nFolder upload complete.")

In [16]:
# local_folder_path = f"{root}/output/adjust_live"

# # Replace with the desired folder name in the Azure container
# azure_folder_name = "live/output"

# # Iterate through files in the local folder and upload them
# for root, dirs, files in tqdm(os.walk(local_folder_path)):
#     for file in files:
#         # Construct the full local file path
#         local_file_path = os.path.join(root, file)

#         # Construct the blob name (path within the Azure container)
#         # This preserves the folder structure from the local path
#         relative_path = os.path.relpath(local_file_path, local_folder_path)
#         blob_name = os.path.join(azure_folder_name, relative_path)
#         # print(blob_name)
#         # Create a blob client for the current file
#         blob_client = container_client.get_blob_client(blob_name)

#         print(f"Uploading {local_file_path} to {container_name}/{blob_name}")

# #         # Upload the file
#         with open(local_file_path, "rb") as data:
#             blob_client.upload_blob(data, overwrite=True)

# print("\nFolder upload complete.")

In [17]:
# # Replace with the path to the local folder you want to delete
# local_folder_paths = [f"{root}/process/adjust_live",f"{root}/output/adjust_live"]
# # local_folder_path = f"data/process/adjust_live"
# for local_folder_path in local_folder_paths:
#     if os.path.exists(local_folder_path):
#         print(f"Deleting local folder: {local_folder_path}")
#         shutil.rmtree(local_folder_path)
#         print("Local folder deleted.")
#     else:
#         print(f"Local folder not found: {local_folder_path}")