In [None]:
import os
import sys

import numpy as np
import polars as pl
import torch

from kshape_core_gpu import KShapeClusteringGPU

# Per-platform data directories. Both end with a path separator because the
# loading code builds file paths by plain string concatenation (DIR + name).
# NOTE(review): MAC_DIR is an absolute path on one developer's machine —
# consider making the data directory configurable.
MAC_DIR = '/Users/igwanhyeong/PycharmProjects/data_research/data/'
WINDOW_DIR = '/modeling_module/data/'

if sys.platform == 'win32':
    DIR = WINDOW_DIR
    # Environment diagnostics for the Windows machine.
    print(torch.cuda.is_available())
    print(torch.cuda.device_count())
    print(torch.version.cuda)
    print(torch.__version__)
    # get_device_name(0) raises when no CUDA device is usable, so only query
    # it when CUDA is actually available.
    if torch.cuda.is_available():
        print(torch.cuda.get_device_name(0))
else:
    DIR = MAC_DIR

# Operating-part master: the part key and its name, exposed under snake_case names.
tb_bas_oper_part_mst = pl.read_parquet(DIR + 'tb_bas_oper_part_mst.parquet').select(
    pl.col('OPER_PART_NO').alias('oper_part_no'),
    pl.col('OPER_PART_NM').alias('oper_part_nm'),
)

# Sell-out demand records, reduced to the four columns used downstream and
# emitted in the order part_no, oper_part_no, demand_dt, demand_qty.
tb_dyn_fcst_demand_sellout = pl.read_parquet(DIR + 'tb_dyn_fcst_dmnd_sellout.parquet').select(
    pl.col('PART_NO').alias('part_no'),
    pl.col('OPER_PART_NO').alias('oper_part_no'),
    pl.col('DMND_DT').alias('demand_dt'),
    pl.col('DMND_QTY').alias('demand_qty'),
)

In [None]:
# Monthly demand per operating part.
# The part master is deliberately not joined here: the aggregation keeps only
# oper_part_no / demand_yyyymm / demand_qty, so a joined oper_part_nm would be
# discarded anyway, while any duplicated oper_part_no in the master would
# multiply demand rows and inflate the monthly sums.
target_df = (
    tb_dyn_fcst_demand_sellout
    # assumes demand_dt encodes YYYYMMDD (integer or digit string), so integer
    # division by 100 leaves YYYYMM — TODO confirm against the source table
    .with_columns(
        (pl.col('demand_dt').cast(pl.Int64) // 100).alias('demand_yyyymm')
    )
    .group_by(['oper_part_no', 'demand_yyyymm'])
    .agg(pl.col('demand_qty').sum().alias('demand_qty'))
    .sort(['oper_part_no', 'demand_yyyymm'])
    # Build the first-of-month date from the integer year/month parts instead
    # of parsing a day-less '%Y%m' string, which strict date parsing can reject.
    .with_columns(
        pl.date(
            pl.col('demand_yyyymm') // 100,
            pl.col('demand_yyyymm') % 100,
            1,
        ).alias('month')
    )
)


# Calendar spine: every month from the first to the last observed month.
min_month = target_df['month'].min()
max_month = target_df['month'].max()

full_months = pl.date_range(start=min_month, end=max_month, interval='1mo', eager=True)
month_df = full_months.to_frame('month')

# One row per (operating part, month) combination, whether or not demand exists.
unique_parts = target_df.select('oper_part_no').unique()
base = unique_parts.join(month_df, how='cross')

# Wide demand matrix: one row per oper_part_no, one column per month.
aligned_df = (
    base
    .join(
        target_df.select(['oper_part_no', 'month', 'demand_qty']),
        on=['oper_part_no', 'month'],
        how='left',
    )
    # Months with no recorded demand for a part count as zero demand.
    .with_columns(pl.col('demand_qty').fill_null(0.0))
    # Pivot emits columns in order of first appearance and keeps index order,
    # so sort explicitly: this guarantees chronological month columns (the time
    # axis of every series) and a deterministic row order, independent of
    # whatever row order the joins happened to produce.
    .sort(['oper_part_no', 'month'])
    .pivot(
        on='month',
        index='oper_part_no',
        values='demand_qty',
        aggregate_function='first',
    )
)

In [None]:
# Inspect the wide demand matrix: one row per oper_part_no, one column per month.
aligned_df

In [None]:
# Keep the part identifiers aside; their order matches the rows of the matrix below.
part_numbers = aligned_df['oper_part_no']

# Everything except the identifier column is the numeric month-by-month demand.
X_np = aligned_df.drop('oper_part_no').to_numpy()

# Append a trailing singleton axis: (N parts, T months) -> (N, T, 1).
X_tensor = torch.tensor(X_np, dtype=torch.float32)[:, :, None]

In [None]:
# Clustering hyper-parameters, named so they are easy to find and tune.
N_CLUSTERS = 5
MAX_ITER = 100
SEED = 42

# centroid_init='random' makes the fit stochastic; seed the global RNGs first
# so a fresh Restart & Run All reproduces the same clusters.
# NOTE(review): presumably KShapeClusteringGPU draws from torch and/or numpy
# global RNG state — confirm, or pass a seed directly if its constructor
# accepts one.
torch.manual_seed(SEED)
np.random.seed(SEED)

model = KShapeClusteringGPU(n_clusters=N_CLUSTERS, centroid_init='random', max_iter=MAX_ITER)
model.fit(X_tensor)

labels = model.labels_
centroids = model.centroids_

# One row per operating part with its assigned cluster; part_numbers and
# labels are both in the row order of aligned_df.
result = pl.DataFrame({
    'oper_part_no': part_numbers,
    'cluster_label': labels.astype(int)
})

result

In [None]:
# DIR already ends with a path separator, so join the pieces instead of
# concatenating '/fit/...' (which produced a '//' in the path).
fit_dir = os.path.join(DIR, 'fit')
# NOTE(review): presumably model.save() does not create missing parent
# directories — make sure the target directory exists before saving.
os.makedirs(fit_dir, exist_ok=True)
model.save(os.path.join(fit_dir, '20250820_k_shape.pkl'))

In [None]:
import matplotlib.pyplot as plt

def plot_kshape_centroids(model, full_months, max_cols=3):
    """
    Plot every cluster centroid in its own panel of a subplot grid.

    Parameters
    ----------
    model :
        Trained KShapeClusteringGPU model; ``n_clusters`` and ``centroids_``
        are read from it.
    full_months :
        Month values used as the pivot columns (dates or strings); used as
        the x axis of every panel.
    max_cols : int
        Maximum number of panels per row. Values below 1 are treated as 1.

    Raises
    ------
    ValueError
        If ``model.n_clusters`` is less than 1.
    """
    k = model.n_clusters
    if k < 1:
        raise ValueError(f"model.n_clusters must be >= 1, got {k}")

    # Clamp to at least one column so the row computation cannot divide by zero.
    n_cols = max(1, min(k, max_cols))
    n_rows = (k + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
    panels = axes.ravel()

    for i, ax in enumerate(panels[:k]):
        # detach() keeps .numpy() valid even if the centroid tensor tracks gradients.
        centroid = model.centroids_[i].detach().cpu().squeeze().numpy()
        ax.plot(full_months, centroid)
        ax.set_title(f"Cluster {i} Centroid")
        ax.tick_params(axis='x', labelrotation=45)
        ax.grid(True)

    # When k does not fill the grid, hide the leftover panels instead of
    # leaving empty axes frames in the figure.
    for ax in panels[k:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw one centroid panel per cluster, using the pivot's month range as the x axis.
plot_kshape_centroids(model, full_months=full_months)


In [None]:
import numpy as np
def plot_cluster_samples(model, pivot_df, cluster_id, full_months, n_samples=5, seed=None):
    """
    Plot a random handful of the time series assigned to one cluster.

    Parameters
    ----------
    model :
        Fitted model; ``labels_`` is read and matched to the rows of
        ``pivot_df`` by position.
    pivot_df : pandas.DataFrame
        Wide table with an ``oper_part_no`` column plus one column per month.
        It is not modified.
    cluster_id :
        Label value selecting the cluster to sample from.
    full_months :
        Iterable of month values; ``str(value)`` of each must match a column
        name of ``pivot_df`` (after its column names are converted to str).
    n_samples : int
        Maximum number of series to draw.
    seed : int, optional
        When given, the sample is drawn from ``np.random.default_rng(seed)``
        and is reproducible; when None, numpy's global random state is used.

    Returns
    -------
    None. Prints a message and returns early if the cluster has no members.
    """
    part_numbers = pivot_df["oper_part_no"].to_numpy()
    member_indices = [i for i, label in enumerate(model.labels_) if label == cluster_id]

    if len(member_indices) == 0:
        print(f"No members in cluster {cluster_id}")
        return

    rng = np.random if seed is None else np.random.default_rng(seed)
    chosen = rng.choice(member_indices, size=min(n_samples, len(member_indices)), replace=False)

    month_labels = [str(col) for col in full_months]
    # Work on a relabelled frame so the caller's DataFrame keeps its own columns.
    wide = pivot_df.set_axis(pivot_df.columns.astype(str), axis=1)
    # member_indices are row positions (from enumerate), so select with iloc
    # rather than label-based loc, which breaks on a non-default index.
    sample_values = wide[month_labels].iloc[chosen].to_numpy(dtype=float)

    plt.figure(figsize=(12, 6))
    for idx, y in zip(chosen, sample_values):
        plt.plot(month_labels, y, label=f"Part {part_numbers[idx]}")

    plt.title(f"Samples from Cluster {cluster_id}")
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

# Show a few member series of cluster 0; the plotting helper works on a pandas copy of the wide table.
plot_cluster_samples(model, aligned_df.to_pandas(), cluster_id=0, full_months=full_months)

In [None]:
import seaborn as sns

def plot_cluster_distribution(model):
    """Draw a bar chart counting how many series carry each label in ``model.labels_``."""
    ax = sns.countplot(x=model.labels_)
    ax.set_title("Cluster Distribution")
    ax.set_xlabel("Cluster ID")
    ax.set_ylabel("Number of Members")
    plt.show()

# Show how many series were assigned to each cluster label.
plot_cluster_distribution(model)