In [22]:
import pandas as pd
import sys
import gc

import os
sys.path.append(os.path.abspath(".."))

In [23]:
import s3fs
from typing import List

from utils.common import *
from config.params import *
from preprocessing.transform import transform, tracking_transforming_input, filtering_speed_by_digital
from preprocessing.intervals import get_interval_from_transformed, filter_by_intervals

In [24]:
from pyarrow.dataset import field

In [25]:
import sagemaker
from sagemaker import get_execution_role

In [26]:
import mlflow

In [27]:
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
# SageMaker context shared by the rest of the notebook: the session object,
# the AWS region it is bound to, and the execution role of this environment.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = get_execution_role()

In [29]:
# ARN of the SageMaker MLflow tracking server that runs are logged to.
# The value can be overridden per account/region through the standard
# MLFLOW_TRACKING_URI environment variable; when that variable is unset or
# empty, the original server ARN below is used, so existing behaviour is kept.
tracking_server_arn = os.environ.get("MLFLOW_TRACKING_URI") or (
    'arn:aws:sagemaker:ap-southeast-1:771463264346:mlflow-tracking-server/mlflow-RCF-server'
)

In [30]:
# Point MLflow at the tracking server identified by `tracking_server_arn`.
mlflow.set_tracking_uri(tracking_server_arn)

In [31]:
# Motor prefixes to process. "TD" is currently disabled; append it to re-enable.
motors = ["DWA", "DWB", "DWC"]
motors

['DWA', 'DWB', 'DWC']

In [32]:
# NOTE(review): `date_folders` is displayed here, but this notebook only assigns it
# in the following cell — presumably the name already exists via the star import
# from config.params (or leftover kernel state). Confirm this cell survives
# Restart Kernel -> Run All.
date_folders

['2024-04',
 '2024-05',
 '2024-06',
 '2024-07',
 '2024-08',
 '2024-09',
 '2024-10',
 '2024-11',
 '2024-12',
 '2025-01',
 '2025-02',
 '2025-03']

In [33]:
# Monthly partitions to process, formatted "YYYY-MM": April 2024 through March 2025.
date_folders = [f"2024-{month:02d}" for month in range(4, 13)] + [
    f"2025-{month:02d}" for month in range(1, 4)
]

In [34]:
# Run the speed-by-digital filtering step once per (motor, month) combination.
# NOTE(review): the recorded output of this cell shows the same
# `transformed/2025-03/DWA_ACTUAL_MOTOR_SPEED_...` input path and the same shape
# (80749, 4) for both 2024-04 and 2024-05 — it looks like the helper may not be
# resolving its input by `date_folder`; confirm inside `filtering_speed_by_digital`.
for motor in motors:
    print(motor)

    for date_folder in date_folders:
        print(date_folder)
        # A fresh timestamp is taken for every call and handed to the helper.
        current_time = get_current_timestamp_string()

        filtering_speed_by_digital(
            motor=motor,
            date_folder=date_folder,
            current_time=current_time
        )
        # Force a garbage-collection pass after each month before the next call.
        gc.collect()

DWA
2024-04
This-> s3://s3-assetcare-bucket/features_store/transformed/2025-03/DWA_ACTUAL_MOTOR_SPEED_20250503_163629.parquet
(80749, 4)
✅ Saved to s3://s3-assetcare-bucket/features_store/filtered/2024-04/DWA_ACTUAL_MOTOR_SPEED_20250506_024709.parquet
✅ Phase 2 tracking complete for DWA_ACTUAL_MOTOR_SPEED
🏃 View run Filtering_DWA_ACTUAL_MOTOR_SPEED_by_DWA_INVERTER_RUNNING_2024-04 at: https://ap-southeast-1.experiments.sagemaker.aws/#/experiments/18/runs/9e14d300d9c340eea13d20f41437da90
🧪 View experiment at: https://ap-southeast-1.experiments.sagemaker.aws/#/experiments/18
2024-05
This-> s3://s3-assetcare-bucket/features_store/transformed/2025-03/DWA_ACTUAL_MOTOR_SPEED_20250503_163629.parquet
(80749, 4)
✅ Saved to s3://s3-assetcare-bucket/features_store/filtered/2024-05/DWA_ACTUAL_MOTOR_SPEED_20250506_024713.parquet
✅ Phase 2 tracking complete for DWA_ACTUAL_MOTOR_SPEED
🏃 View run Filtering_DWA_ACTUAL_MOTOR_SPEED_by_DWA_INVERTER_RUNNING_2024-05 at: https://ap-southeast-1.experiments.sag

In [35]:
def get_child_run_param_by_name(
    experiment_name: str,
    parent_run_name: str,
    child_keyword: str,
    param_key: str
):
    """Return a logged parameter from a child run whose name contains a keyword.

    The lookup proceeds in three steps: resolve the experiment by name, pick the
    most recently started run named ``parent_run_name`` in it, then scan that
    parent's child runs (newest first) for the first one whose
    ``mlflow.runName`` tag contains ``child_keyword``.

    Args:
        experiment_name: Name of the MLflow experiment to search.
        parent_run_name: Exact run name of the parent run.
        child_keyword: Substring that must occur in the child run's name.
        param_key: Key of the logged parameter to read from the matching child.

    Returns:
        The parameter value as a string, or ``None`` when the experiment, the
        parent run, a matching child run, or the parameter itself cannot be
        found, or when any error occurs during the lookup. A diagnostic
        message is printed in every ``None`` case.
    """
    try:
        # Imported locally so the function does not depend on an import that
        # only appears in a later notebook cell.
        from mlflow.tracking import MlflowClient

        client = MlflowClient()

        # Get experiment ID
        experiment = client.get_experiment_by_name(experiment_name)
        if experiment is None:
            print(f"Experiment '{experiment_name}' not found.")
            return None
        experiment_id = experiment.experiment_id

        # Find parent run by name (most recently started one wins)
        parent_runs = client.search_runs(
            experiment_ids=[experiment_id],
            filter_string=f'tags.mlflow.runName = "{parent_run_name}"',
            order_by=["start_time DESC"],
            max_results=1
        )

        if not parent_runs:
            print(f"No parent run with name '{parent_run_name}' found.")
            return None

        parent_run_id = parent_runs[0].info.run_id

        # Find child runs of this parent, newest first
        child_runs = client.search_runs(
            experiment_ids=[experiment_id],
            filter_string=f'tags.mlflow.parentRunId = "{parent_run_id}"',
            order_by=["start_time DESC"],
        )

        # Use the first child run whose name contains the keyword
        for run in child_runs:
            run_name = run.data.tags.get("mlflow.runName", "")
            if child_keyword in run_name:
                if param_key not in run.data.params:
                    # Report the miss and return None rather than handing an
                    # error message back as if it were the parameter value.
                    print(f"Param '{param_key}' not found.")
                    return None
                return run.data.params[param_key]

        print(f"No child run matching keyword '{child_keyword}' found.")
        return None

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None
        # so an interactive session keeps running.
        print("Error:", e)
        return None

In [36]:
# Motor under inspection and the name of the MLflow child run that transformed its
# ACTUAL_MOTOR_SPEED signal for May 2024. The digital counterpart runs follow the
# same pattern, e.g. "Transforming_DWA_INVERTER_RUNNING_2024-05".
motor = "DWA"
tag_name_speed = f"Transforming_{motor}_ACTUAL_MOTOR_SPEED_2024-05"

In [37]:
# Look up the `destination_parquet_file` parameter of the child run named by
# `tag_name_speed`, searching under the parent run configured for digital inputs.
# NOTE(review): this call uses the `child_keyword` / `param_key` keyword names of the
# definition in cell In [35]. The redefinition in cell In [40] renames them to
# `child_run_name` / `key_param`, so re-running this cell after that one raises
# TypeError.
get_child_run_param_by_name(
    experiment_name=experiment_name_RegularInterval,
    parent_run_name=experiment_name_RegularInterval_DigitalInput,
    child_keyword=tag_name_speed,
    param_key="destination_parquet_file"
)


# Same kind of lookup written against the later signature, kept for reference:
# key_val = get_child_run_param_by_name(
#     experiment_name="1. Regular Interval",
#     parent_run_name="1.1 Digital Input",
#     child_run_name="Transforming_DWA_INVERTER_RUNNING_2024-05",
#     key_param="destination_parquet_file"
# )


No child run matching keyword 'Transforming_DWA_ACTUAL_MOTOR_SPEED_2024-05' found.


In [38]:
# Show the experiment name used by the lookup
# (presumably provided by the star import from config.params — confirm).
experiment_name_RegularInterval

'1. Regular Interval'

In [39]:
# Show the parent run name used by the lookup
# (presumably provided by the star import from config.params — confirm).
experiment_name_RegularInterval_DigitalInput

'1.1 Digital Input'

In [40]:
from mlflow.tracking import MlflowClient

def get_child_run_param_by_name(
    experiment_name: str,
    parent_run_name: str,
    child_run_name: str,
    key_param: str
):
    """Fetch one logged parameter from an exactly-named child run.

    Note: this redefines the function of the same name declared earlier in the
    notebook; that version matches child names by substring and takes
    ``child_keyword`` / ``param_key`` instead.

    Args:
        experiment_name: Name of the MLflow experiment to search.
        parent_run_name: Run name of the parent; the most recently started
            run with this name is used.
        child_run_name: Exact run name the child run must have.
        key_param: Key of the logged parameter to return.

    Returns:
        The parameter value, or ``None`` (after printing a ``[!]`` message)
        when the experiment, parent run, child run, or parameter is missing.
    """
    mlflow_client = MlflowClient()

    # Step 1: resolve the experiment name to its ID.
    experiment = mlflow_client.get_experiment_by_name(experiment_name)
    if experiment is None:
        print(f"[!] Experiment '{experiment_name}' not found.")
        return None
    experiment_ids = [experiment.experiment_id]

    # Step 2: find the parent run ID from its run name.
    parent_candidates = mlflow_client.search_runs(
        experiment_ids=experiment_ids,
        filter_string=f'tags.mlflow.runName = "{parent_run_name}"',
        order_by=["start_time DESC"],
        max_results=1
    )
    if not parent_candidates:
        print(f"[!] Parent run '{parent_run_name}' not found.")
        return None
    parent_run_id = parent_candidates[0].info.run_id

    # Step 3: list the child runs of that parent.
    children = mlflow_client.search_runs(
        experiment_ids=experiment_ids,
        filter_string=f'tags.mlflow.parentRunId = "{parent_run_id}"'
    )

    # Step 4: take the first child with the requested name and read the param.
    matching_child = next(
        (
            child
            for child in children
            if child.data.tags.get("mlflow.runName") == child_run_name
        ),
        None,
    )
    if matching_child is None:
        print(f"[!] Child run '{child_run_name}' not found under parent '{parent_run_name}'")
        return None

    value = matching_child.data.params.get(key_param)
    if value is None:
        print(f"[!] Param '{key_param}' not found in child run '{child_run_name}'.")
    return value


In [41]:
# Resolve the parquet path logged by the May 2024 DWA_INVERTER_RUNNING transform
# run (exact run-name match under the "1.1 Digital Input" parent run).
key_val = get_child_run_param_by_name(
    experiment_name="1. Regular Interval",
    parent_run_name="1.1 Digital Input",
    child_run_name="Transforming_DWA_INVERTER_RUNNING_2024-05",
    key_param="destination_parquet_file"
)


In [42]:
# Display the resolved parameter value (None when the lookup did not find it).
key_val

's3://s3-assetcare-bucket/features_store/transformed/2024-05/DWA_INVERTER_RUNNING_20250502_054728.parquet'