In [1]:
import pandas as pd
import numpy as np
import sys
import gc

import os
sys.path.append(os.path.abspath(".."))

In [2]:
import s3fs
from typing import List

from utils.common import *
from config.params import *
from preprocessing.transform import transform, tracking_transforming_input
from preprocessing.intervals import get_interval_from_transformed

In [3]:
from preprocessing.prepare_clustering_data import *

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [4]:
from training.models import *
from training.visualize import *

In [5]:
from pyarrow.dataset import field

In [6]:
import sagemaker
from sagemaker import get_execution_role

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Get the latest saved data from the MLflow run
import mlflow
from mlflow.tracking import MlflowClient

In [9]:
from sklearn.cluster import KMeans
from datetime import datetime
from pathlib import Path

In [10]:
# Low-level MLflow client handle.
# NOTE(review): this is constructed before mlflow.set_tracking_uri(...) is
# called further down the notebook. MlflowClient resolves its tracking URI
# when it is constructed, so confirm this client points at the intended
# tracking server on a fresh Restart & Run All.
client = MlflowClient()

In [11]:
# SageMaker context for this notebook:
#   sagemaker_session - SDK session object
#   region            - AWS region name read from that session
#   role              - execution role returned by get_execution_role()
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = get_execution_role()

In [12]:
# ARN of the MLflow tracking server that training runs are tracked with.
# Set the MLFLOW_TRACKING_SERVER_ARN environment variable to target a
# different server (e.g. another account or region) without editing the
# notebook; when it is unset or empty, the original server below is used.
tracking_server_arn = (
    os.environ.get("MLFLOW_TRACKING_SERVER_ARN")
    or 'arn:aws:sagemaker:ap-southeast-1:771463264346:mlflow-tracking-server/mlflow-RCF-server'
)

In [13]:
# Point the fluent MLflow API at the tracking server, then rebuild the client.
# MlflowClient resolves its tracking URI when it is constructed, so the
# `client` created earlier (before this call) would otherwise keep targeting
# the default tracking URI instead of this server.
mlflow.set_tracking_uri(tracking_server_arn)
client = MlflowClient(tracking_uri=tracking_server_arn)

In [14]:
# Single-motor alternatives, kept for reference:
#   motor = "DWA" | motor = "DWB" | motor = "DWC"
# `motors` is not defined in this notebook; it presumably comes from one of
# the star imports above (e.g. config.params) — TODO confirm.
# Displayed here to check which motors the training loop will iterate over.
motors

['DWA', 'DWB', 'DWC']

In [15]:
# Monthly data folders ("YYYY-MM") passed to the data-preparation step:
# April through December 2024.
# "2025-01" .. "2025-03" are currently excluded; append them to include them.
date_folders = [f"2024-{month:02d}" for month in range(4, 13)]

In [16]:
# tag_name = f"{motor}_ACTUAL_MOTOR_SPEED"
# tag_name_digital = f"{motor}_INVERTER_RUNNING"

In [17]:
# Prepare the clustering dataset and run KMeans training once per motor.
# Every name assigned inside the loop is rebound on each pass, so after the
# loop finishes they hold the values for the last motor in `motors` only.
for motor in motors:
    print(motor)

    # Tag names are derived from the motor id.
    tag_name = f"{motor}_ACTUAL_MOTOR_SPEED"
    tag_name_digital = f"{motor}_INVERTER_RUNNING"

    (
        combined_df,
        lst_training_paths,
        destination_parquet_folder,
        filename,
    ) = prepare_training_data_clustering(
        motor=motor,
        date_folders=date_folders,
        tag_name=tag_name,
        tag_name_digital=tag_name_digital,
    )

    clustered_speed = training_kmean(
        filtered_speed=combined_df,
        tag_name=tag_name,
        motor=motor,
        # params to log
        lst_training_paths=lst_training_paths,
        destination_parquet_folder=destination_parquet_folder,
        filename=filename,
        # model params
        experiment_name=experiment_name_TrainingKMeanClustering,
    )
    print("========================")

DWA
✅ Saved to s3://s3-assetcare-bucket/features_store/cluster_datasets/DWA_ACTUAL_MOTOR_SPEED_20250506_030243.parquet
✅ Saved to s3://s3-assetcare-bucket/features_store/cluster_datasets/DWA_ACTUAL_MOTOR_SPEED_20250506_030257.parquet
✅ KMeans clustering + MLflow logging complete for DWA_ACTUAL_MOTOR_SPEED
🏃 View run DWA_ACTUAL_MOTOR_SPEED_5-cluster at: https://ap-southeast-1.experiments.sagemaker.aws/#/experiments/19/runs/783994e1b9304cf8b9eaaf1e6a16e8c9
🧪 View experiment at: https://ap-southeast-1.experiments.sagemaker.aws/#/experiments/19
DWB
✅ Saved to s3://s3-assetcare-bucket/features_store/cluster_datasets/DWB_ACTUAL_MOTOR_SPEED_20250506_030300.parquet
✅ Saved to s3://s3-assetcare-bucket/features_store/cluster_datasets/DWB_ACTUAL_MOTOR_SPEED_20250506_030327.parquet
✅ KMeans clustering + MLflow logging complete for DWB_ACTUAL_MOTOR_SPEED
🏃 View run DWB_ACTUAL_MOTOR_SPEED_5-cluster at: https://ap-southeast-1.experiments.sagemaker.aws/#/experiments/19/runs/997aac8f6ea5461f9dc836e1a79