In [12]:
from dataclasses import dataclass
import numpy as np
import polars as pl
from tqdm import tqdm

MAC_DIR = '/Users/igwanhyeong/PycharmProjects/data_research/raw_data/'

A_dict = {
    'oper_part_no': ['A' for _ in range(8)],
    'demand_dt': [i for i in range(202401, 202409)],
    'demand_qty': [float(i) for i in range(8)],
    'gbm_cd': ['VD' for _ in range(8)]
}

A_pl = pl.DataFrame(A_dict)

B_dict = {
    'oper_part_no': ['B' for _ in range(8)],
    'demand_dt': [i for i in range(202501, 202509)],
    'demand_qty': [float(i) for i in range(8)],
    'gbm_cd': ['VD' for _ in range(8)]
}
B_pl = pl.DataFrame(B_dict)


target_dyn_demand_monthly = pl.read_parquet(MAC_DIR + 'target_dyn_demand_monthly.parquet').with_columns(pl.lit('VD').alias('gbm_cd'))
df = pl.concat([target_dyn_demand_monthly, A_pl, B_pl])

df = df.rename({'oper_part_no': 'part_no', 'demand_dt': 'yyyymm', 'demand_qty': 'qty'})
df = df[:100000]

In [13]:
LOOKBACK = 48
DONOR_K = 5
ALPHA = 7.0         # 유사도 softmax 온도
LAMBDA_GAP = 0.05   # 스케일 갭 페널티

# ---- yyyymm utility ----
def split_yyyymm(yyyymm: int): return yyyymm // 100, yyyymm % 100
def join_yyyymm(y: int, m: int): return y * 100 + m
def add_months_yyyymm(yyyymm: int, k: int) -> int:
    y, m = split_yyyymm(yyyymm); m += k
    y += (m - 1) // 12; m = (m - 1) % 12 + 1
    return join_yyyymm(y, m)
def months_window(plan_yyyymm: int, L: int) -> list[int]:
    return [add_months_yyyymm(plan_yyyymm, -(L - i)) for i in range(1, L + 1)]

In [14]:
def build_window_table(df_bu: pl.DataFrame, plan_yyyymm: int, lookback: int) -> pl.DataFrame:
    win_months = months_window(plan_yyyymm, lookback)
    idx = pl.DataFrame({'yyyymm': win_months})
    parts = df_bu.select('part_no').unique()
    base = parts.join(idx, how = 'cross')   # part_no x 48개월 격자

    # 해당 윈도우만 필터
    bu_win = df_bu.filter(pl.col('yyyymm').is_in(win_months))

    # LEFT JOIN 후 결측 0 유지(실측 존재 여부 구분 위해 qty_raw도 보유)
    j = (base.join(bu_win, on = ['part_no', 'yyyymm'], how = 'left')
             .with_columns([
                pl.col('qty').alias('qty_raw')
                # pl.col('qty').fill_null(None).alias('qty')  # qty는 결측 그대로 두고 처리
            ])
            .sort(['part_no', 'yyyymm'])
         )
    return j

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    denom = (np.linalg.norm(a) * np.linalg.norm(b))
    return float(a.dot(b) / denom) if denom > 0 else 0.0

def pick_donors(win_df: pl.DataFrame, target_pno: str, donor_k = 5):
    comp = (win_df
             .group_by('part_no').agg(pl.col('qty').is_not_null().sum().alias('n_obs'))
             .filter(pl.col('n_obs') == LOOKBACK)
             .select('part_no')['part_no'].to_list()
            )
    donors = [p for p in comp if p != target_pno]
    return donors[:min(len(donors), donor_k * 5)] # 여유 후보

def analog_transfer_for_one(win_df: pl.DataFrame, target_pno: str, plan_yyyymm: int):
    x = win_df.filter(pl.col("part_no")==target_pno).sort("yyyymm")["qty"].to_numpy()
    mask = ~np.isnan(x)
    if mask.sum() == 0:
        return None, None  # 정보 없음 → skip

    # 유사도 계산
    donors = pick_donors(win_df, target_pno, donor_k=DONOR_K)
    sims = []
    packs = []
    for d_pno in donors:
        d = win_df.filter(pl.col("part_no")==d_pno).sort("yyyymm")["qty"].to_numpy()
        over = mask  # 도너는 48 모두 채워져 있으므로 겹침=mask
        if over.sum() < 3:
            continue
        # 형태 비교(스케일 제거): 중앙값으로 나눠서 정규화
        tx = x[over]; td = d[over]
        mx = np.median(tx); md = np.median(td)
        nx = (tx / (mx if mx!=0 else 1.0)) - np.mean(tx)  # 옵션: 중심화
        nd = (td / (md if md!=0 else 1.0)) - np.mean(td)
        s = cosine_sim(nx, nd)
        sims.append(s)
        packs.append((d_pno, d, mx, md))

    if not sims:
        return None, None

    # 상위 K 도너 선택 + softmax 가중치
    order = np.argsort(sims)[::-1][:DONOR_K]
    sims = [sims[i] for i in order]
    packs = [packs[i] for i in order]
    w = np.exp(np.array(sims)*ALPHA); w = w / w.sum()

    # 스케일 계산(갭 패널티)
    # 타깃 마지막 관측 월과 plan 사이 갭
    ym = win_df.filter(pl.col("part_no")==target_pno).sort("yyyymm")["yyyymm"].to_list()
    last_obs_idx = int(np.where(mask)[0][-1])
    gap = (len(ym)-1 - last_obs_idx)  # plan-1까지 인덱스 기준 경과 개월
    decay = np.exp(-LAMBDA_GAP * gap)

    # 도너별 s_i
    s_list = []
    for (d_pno, d, mx, md) in packs:
        denom = md if md!=0 else 1.0
        s_i = (mx/denom) * decay
        s_list.append(s_i)

    donor_stack = np.vstack([s_list[i]*packs[i][1] for i in range(len(packs))])  # (K, 48)
    y_prior = (w[:,None] * donor_stack).sum(axis=0)  # 전이 곡선

    # 결측만 전이로 채우기
    filled = x.copy()
    filled[~mask] = y_prior[~mask]

    # 간단 가드레일(필요 시 강화)
    # - step caps
    prev = filled[np.where(mask)[0][0]]  # 첫 관측값
    for t in range(1, len(filled)):
        up = prev * 1.2   # 예: +20% cap (운영값에 맞춰 조정)
        dn = prev * 0.6   # 예: -40% cap
        filled[t] = np.clip(filled[t], dn, up)
        prev = filled[t]

    filled_mask = (~mask).astype(int)  # 1이면 전이로 채움
    return filled, filled_mask


def build_targets_for_bu(df_bu: pl.DataFrame, plan_yyyymm: int) -> tuple[pl.DataFrame, pl.DataFrame]:
    win = build_window_table(df_bu, plan_yyyymm, LOOKBACK)

    # 품목별 관측 개수
    stat = (win.group_by("part_no")
               .agg(pl.col("qty").is_not_null().sum().alias("n_obs")))

    # 정상/단기 분리
    normals = stat.filter(pl.col("n_obs")==LOOKBACK)["part_no"].to_list()
    shorts  = stat.filter(pl.col("n_obs")< LOOKBACK)["part_no"].to_list()

    # target_long: 실측만으로 48개월 완비
    target_long = (win.filter(pl.col("part_no").is_in(normals))
                      .select("gbm_cd","part_no","yyyymm","qty")
                      .with_columns(pl.lit(0).alias("filled_mask")))

    # target_short: 도너 전이로 채우기
    out_rows = []
    for pno in tqdm(shorts):
        filled, fmask = analog_transfer_for_one(win, pno, plan_yyyymm)
        if filled is None:
            continue
        ym = win.filter(pl.col("part_no")==pno).sort("yyyymm")["yyyymm"].to_list()
        bu = df_bu.select("gbm_cd").unique().item()  # 단일 BU 프레임이라 가정
        for i in range(LOOKBACK):
            out_rows.append((bu, pno, ym[i], float(filled[i]), int(fmask[i])))

    target_short = pl.DataFrame(out_rows, schema=["gbm_cd","part_no","yyyymm","qty","filled_mask"])
    return target_long, target_short

In [16]:
long, short = build_targets_for_bu(df, plan_yyyymm = 202401)

100%|██████████| 5467/5467 [01:00<00:00, 89.94it/s] 
  target_short = pl.DataFrame(out_rows, schema=["gbm_cd","part_no","yyyymm","qty","filled_mask"])


In [21]:
long

gbm_cd,part_no,yyyymm,qty,filled_mask
str,str,i64,f64,i32
"""VD""","""01517-51632""",202002,16.0,0
"""VD""","""01517-51632""",202003,255.0,0
"""VD""","""01517-51632""",202004,102.0,0
"""VD""","""01517-51632""",202005,125.0,0
"""VD""","""01517-51632""",202006,54.0,0
…,…,…,…,…
"""VD""","""16851-5203-2""",202309,60.0,0
"""VD""","""16851-5203-2""",202310,27.0,0
"""VD""","""16851-5203-2""",202311,14.0,0
"""VD""","""16851-5203-2""",202312,76.0,0


In [25]:
short.filter(pl.col('part_no') == "0001-1001")

gbm_cd,part_no,yyyymm,qty,filled_mask
str,str,i64,f64,i64
"""VD""","""0001-1001""",202002,120.0,0
"""VD""","""0001-1001""",202003,72.0,0
"""VD""","""0001-1001""",202004,43.2,1
"""VD""","""0001-1001""",202005,25.92,1
"""VD""","""0001-1001""",202006,15.552,1
…,…,…,…,…
"""VD""","""0001-1001""",202309,0.990377,0
"""VD""","""0001-1001""",202310,1.188452,1
"""VD""","""0001-1001""",202311,1.426142,1
"""VD""","""0001-1001""",202312,1.711371,1


In [24]:
df.filter(pl.col('part_no') == "0001-1001")

part_no,yyyymm,qty,gbm_cd
str,i64,f64,str
"""0001-1001""",201803,5.0,"""VD"""
"""0001-1001""",201811,7.0,"""VD"""
"""0001-1001""",202002,120.0,"""VD"""
"""0001-1001""",202003,2.0,"""VD"""
"""0001-1001""",202305,2.0,"""VD"""
"""0001-1001""",202306,1.0,"""VD"""
"""0001-1001""",202309,1.0,"""VD"""
"""0001-1001""",202605,2.0,"""VD"""
"""0001-1001""",202606,1.0,"""VD"""
"""0001-1001""",202609,1.0,"""VD"""
