<a href="https://colab.research.google.com/github/Heoyuna0819/machine_learning/blob/main/Mhealth_CNN%2BLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd

# 데이터셋 경로 설정
DATA_DIR = "/content/drive/MyDrive/MHEALTHDATASET"

# UCI MHEALTH dataset 컬럼 정의
COLS = [
    "chest_acc_x","chest_acc_y","chest_acc_z",
    "ecg_1","ecg_2",
    "ankle_acc_x","ankle_acc_y","ankle_acc_z",
    "ankle_gyro_x","ankle_gyro_y","ankle_gyro_z",
    "ankle_mag_x","ankle_mag_y","ankle_mag_z",
    "arm_acc_x","arm_acc_y","arm_acc_z",
    "arm_gyro_x","arm_gyro_y","arm_gyro_z",
    "arm_mag_x","arm_mag_y","arm_mag_z",
    "label"
]

# 모든 subject 데이터 읽어서 합치기
dfs = []
for sid in range(1, 11):  # subject 1~10
    file_path = os.path.join(DATA_DIR, f"mHealth_subject{sid}.log")
    df = pd.read_csv(file_path, sep="\t", header=None, names=COLS)
    df["subject"] = sid   # subject 번호 추가
    dfs.append(df)

# 하나의 DataFrame으로 합치기
full_df = pd.concat(dfs, ignore_index=True)

# Null 클래스(라벨=0)는 제거
full_df = full_df[full_df["label"] != 0].reset_index(drop=True)

print(full_df.shape)
print(full_df.head())


(343195, 25)
   chest_acc_x  chest_acc_y  chest_acc_z     ecg_1     ecg_2  ankle_acc_x  \
0      -9.7788      0.55690      1.19750  0.008373 -0.033490       2.6493   
1      -9.7733      0.27880      0.73036 -0.025118 -0.025118       2.4157   
2      -9.8609      0.11561      0.79988  0.025118  0.016745       2.3865   
3      -9.7409      0.17652      0.88957  0.180010  0.129770       2.3758   
4      -9.7821      0.21637      0.90368  0.092098  0.046049       2.3239   

   ankle_acc_y  ankle_acc_z  ankle_gyro_x  ankle_gyro_y  ...  arm_acc_y  \
0      -9.4517      0.37683      -0.20965      -0.88931  ...    -9.0618   
1      -9.5306      0.40179      -0.20965      -0.88931  ...    -9.2048   
2      -9.5991      0.48141      -0.20037      -0.86867  ...    -9.1945   
3      -9.5997      0.42919      -0.20037      -0.86867  ...    -9.1746   
4      -9.5406      0.40038      -0.20037      -0.86867  ...    -9.2039   

   arm_acc_z  arm_gyro_x  arm_gyro_y  arm_gyro_z  arm_mag_x  arm_mag_y  \

In [3]:
import numpy as np

# 윈도우 크기 & stride
FS = 50          # 주파수 50Hz
WIN = 2 * FS     # 2초 = 100 샘플
STRIDE = WIN // 2  # 절반 겹치기 = 50 샘플

# feature 컬럼 (라벨과 subject 제외)
FEATURE_COLS = [c for c in full_df.columns if c not in ["label", "subject"]]

def make_windows_by_subject(df, min_ratio=0.0):
    Xs, ys, subs = [], [], []

    # subject별로 안전하게 자르기
    for sid, part in df.groupby("subject", sort=True):
        arr = part[FEATURE_COLS].values.astype(np.float32)  # (N_s, 23)
        labels = part["label"].values.astype(np.int32)      # (N_s,)
        n = len(part)
        i = 0
        while i + WIN <= n:
            w_labels = labels[i:i+WIN]
            w_vals = arr[i:i+WIN]

            # 최빈값(label)과 비율
            binc = np.bincount(w_labels)
            maj = np.argmax(binc)
            maj_ratio = binc[maj] / WIN

            if maj_ratio >= min_ratio:
                Xs.append(w_vals)
                ys.append(maj)
                subs.append(sid)

            i += STRIDE

    X = np.array(Xs)             # (num_windows, 100, 23)
    y = np.array(ys, dtype=int)  # (num_windows,)
    s = np.array(subs, dtype=int)
    return X, y, s

X_all, y_all, s_all = make_windows_by_subject(full_df, min_ratio=0.0)

print("X_all shape:", X_all.shape)
print("y_all shape:", y_all.shape)
print("subject unique:", np.unique(s_all))
# 라벨 1~12 분포
counts = np.bincount(y_all)
print("라벨 분포(1~12):", counts[1:13])

X_all shape: (6849, 100, 23)
y_all shape: (6849,)
subject unique: [ 1  2  3  4  5  6  7  8  9 10]
라벨 분포(1~12): [610 615 615 614 608 567 587 587 614 614 616 202]


In [4]:
import numpy as np

# 5개 그룹
pairs = [(1,2), (3,4), (5,6), (7,8), (9,10)]

def split_by_subject(X, y, s, test_pair):
    """
    X: 윈도우 입력 데이터 (shape: [N, T, D])
    y: 라벨 (shape: [N])
    s: subject 번호 (shape: [N])
    test_pair: (예: (1,2)) 테스트 subject 번호 쌍
    """
    test_mask = np.isin(s, test_pair)   # 테스트셋에 해당하는 subject만 True
    X_train, y_train = X[~test_mask], y[~test_mask]
    X_test,  y_test  = X[test_mask],  y[test_mask]
    return X_train, y_train, X_test, y_test

# 다섯 fold 순환
for i, pair in enumerate(pairs, 1):
    X_tr, y_tr, X_te, y_te = split_by_subject(X_all, y_all, s_all, pair)
    print(f"Fold {i} | Test subjects={pair}")
    print("  Train set:", X_tr.shape, y_tr.shape)
    print("  Test set :", X_te.shape, y_te.shape)

Fold 1 | Test subjects=(1, 2)
  Train set: (5438, 100, 23) (5438,)
  Test set : (1411, 100, 23) (1411,)
Fold 2 | Test subjects=(3, 4)
  Train set: (5438, 100, 23) (5438,)
  Test set : (1411, 100, 23) (1411,)
Fold 3 | Test subjects=(5, 6)
  Train set: (5529, 100, 23) (5529,)
  Test set : (1320, 100, 23) (1320,)
Fold 4 | Test subjects=(7, 8)
  Train set: (5500, 100, 23) (5500,)
  Test set : (1349, 100, 23) (1349,)
Fold 5 | Test subjects=(9, 10)
  Train set: (5491, 100, 23) (5491,)
  Test set : (1358, 100, 23) (1358,)


In [5]:
import numpy as np

# 5개 그룹
pairs = [(1,2), (3,4), (5,6), (7,8), (9,10)]

def split_by_subject(X, y, s, test_pair):
    test_mask = np.isin(s, test_pair)
    X_train, y_train = X[~test_mask], y[~test_mask]
    X_test,  y_test  = X[test_mask],  y[test_mask]
    return X_train, y_train, X_test, y_test

def standardize(train_X, test_X):

    # 훈련 데이터 기준으로 mean/std 계산
    mean = train_X.mean(axis=(0,1), keepdims=True)
    std = train_X.std(axis=(0,1), keepdims=True)
    std[std == 0] = 1.0

    # 표준화 적용
    train_X = (train_X - mean) / std
    test_X  = (test_X  - mean) / std
    return train_X, test_X, mean, std

# 5-Fold 교차검증용 루프
for i, pair in enumerate(pairs, 1):
    X_tr, y_tr, X_te, y_te = split_by_subject(X_all, y_all, s_all, pair)
    X_tr, X_te, mean, std = standardize(X_tr, X_te)

    print(f"Fold {i} | Test subjects={pair}")
    print("  Train set:", X_tr.shape, y_tr.shape)
    print("  Test set :", X_te.shape, y_te.shape)
    print("  평균:", np.round(mean.mean(), 3), " | 표준편차:", np.round(std.mean(), 3))
    print("-" * 50)

Fold 1 | Test subjects=(1, 2)
  Train set: (5438, 100, 23) (5438,)
  Test set : (1411, 100, 23) (1411,)
  평균: -1.097  | 표준편차: 14.805
--------------------------------------------------
Fold 2 | Test subjects=(3, 4)
  Train set: (5438, 100, 23) (5438,)
  Test set : (1411, 100, 23) (1411,)
  평균: -1.105  | 표준편차: 14.749
--------------------------------------------------
Fold 3 | Test subjects=(5, 6)
  Train set: (5529, 100, 23) (5529,)
  Test set : (1320, 100, 23) (1320,)
  평균: -1.076  | 표준편차: 14.514
--------------------------------------------------
Fold 4 | Test subjects=(7, 8)
  Train set: (5500, 100, 23) (5500,)
  Test set : (1349, 100, 23) (1349,)
  평균: -1.137  | 표준편차: 14.941
--------------------------------------------------
Fold 5 | Test subjects=(9, 10)
  Train set: (5491, 100, 23) (5491,)
  Test set : (1358, 100, 23) (1358,)
  평균: -1.045  | 표준편차: 14.942
--------------------------------------------------


In [27]:
def set_seed(seed=42):
    return np.random.RandomState(seed)

def one_hot(y, num_classes):
    oh = np.zeros((len(y), num_classes), np.float32)
    oh[np.arange(len(y)), y-1] = 1
    return oh

def softmax(x):
    x = x - x.max(axis=1, keepdims=True)
    e = np.exp(x)
    return e / (e.sum(axis=1, keepdims=True)+1e-12)

def cross_entropy(p, yoh):
    return -np.mean(np.sum(yoh*np.log(p+1e-12), axis=1))

def macro_f1(yt, yp, K=12):
    f1s=[]
    for c in range(1,K+1):
        tp = np.sum((yp==c)&(yt==c))
        fp = np.sum((yp==c)&(yt!=c))
        fn = np.sum((yp!=c)&(yt==c))
        p = tp/(tp+fp+1e-12)
        r = tp/(tp+fn+1e-12)
        f1 = 2*p*r/(p+r+1e-12)
        f1s.append(f1)
    return float(np.mean(f1s))

class Conv1D:
    def __init__(self, k, c_in, c_out, rng):
        lim = np.sqrt(6/(k*c_in+c_out))
        self.W = rng.uniform(-lim, lim, (k,c_in,c_out)).astype(np.float32)
        self.b = np.zeros((c_out,), np.float32)
        self.mW=np.zeros_like(self.W); self.vW=np.zeros_like(self.W)
        self.mb=np.zeros_like(self.b); self.vb=np.zeros_like(self.b)

        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        self.x = x
        N,T,C = x.shape
        k,_,Cout = self.W.shape
        To = T-k+1
        out = np.zeros((N,To,Cout), np.float32)
        for t in range(To):
            out[:,t,:] = np.tensordot(x[:,t:t+k,:], self.W, axes=([1,2],[0,1])) + self.b
        return out

    def backward(self, dout):
        x = self.x
        N,T,C = x.shape
        k,_,Cout = self.W.shape
        To = T-k+1

        dW = np.zeros_like(self.W)
        db = dout.sum((0,1))
        dx = np.zeros_like(x)

        for t in range(To):
            xs = x[:, t:t+k, :]
            dW += np.tensordot(xs, dout[:,t,:], axes=([0],[0]))
            dx[:, t:t+k, :] += np.tensordot(dout[:,t,:], self.W, axes=([1],[2]))

        self.dW, self.db = dW, db
        return dx

class ReLU:
    def forward(self, x):
        self.mask = (x>0)
        return x*self.mask
    def backward(self, dout):
        return dout*self.mask

class MaxPool1D:
    def __init__(self, pool=2, stride=2):
        self.pool=pool
        self.stride=stride

    def forward(self,x):
        self.x=x
        N,T,C = x.shape
        To = 1+(T-self.pool)//self.stride
        out = np.zeros((N,To,C),np.float32)
        self.argmax = np.zeros((N,To,C),np.int32)
        for t in range(To):
            s=t*self.stride
            w=x[:,s:s+self.pool,:]
            idx = np.argmax(w,axis=1)
            out[:,t,:]=w[np.arange(N)[:,None], idx, np.arange(C)]
            self.argmax[:,t,:]=s+idx
        return out

    def backward(self,dout):
        x=self.x
        N,T,C=x.shape
        _,To,_=dout.shape
        dx=np.zeros_like(x)
        for t in range(To):
            idx=self.argmax[:,t,:]
            for n in range(N):
                dx[n, idx[n,:], np.arange(C)] += dout[n,t,:]
        return dx

class Dense:
    def __init__(self, in_dim, out_dim, rng):
        lim=np.sqrt(6/(in_dim+out_dim))
        self.W=rng.uniform(-lim,lim,(in_dim,out_dim)).astype(np.float32)
        self.b=np.zeros((out_dim,),np.float32)

        self.mW=np.zeros_like(self.W); self.vW=np.zeros_like(self.W)
        self.mb=np.zeros_like(self.b); self.vb=np.zeros_like(self.b)

        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self,x):
        self.x=x
        return x@self.W+self.b

    def backward(self,dout):
        self.dW=self.x.T@dout
        self.db=dout.sum(0)
        return dout@self.W.T


In [32]:
# 1. LSTM

class LSTM:
    def __init__(self, D, H, rng):
        self.D = D
        self.H = H
        lim = np.sqrt(1.0 / (D + H))

        self.Wx = rng.uniform(-lim, lim, (D, 4*H)).astype(np.float32)
        self.Wh = rng.uniform(-lim, lim, (H, 4*H)).astype(np.float32)
        self.b  = np.zeros((4*H,), np.float32)

        # Adam 용 변수
        self.mWx = np.zeros_like(self.Wx); self.vWx = np.zeros_like(self.Wx)
        self.mWh = np.zeros_like(self.Wh); self.vWh = np.zeros_like(self.Wh)
        self.mb  = np.zeros_like(self.b);  self.vb  = np.zeros_like(self.b)

        # gradient 버퍼
        self.dWx = np.zeros_like(self.Wx)
        self.dWh = np.zeros_like(self.Wh)
        self.db  = np.zeros_like(self.b)

    def forward(self, X):
        """
        X: (N, T, D)
        return hs: (N, T, H)
        """
        N, T, D = X.shape
        H = self.H

        self.x_list = []
        self.i_list = []
        self.f_list = []
        self.o_list = []
        self.g_list = []
        self.c_list = []
        self.h_list = []

        h = np.zeros((N, H), np.float32)
        c = np.zeros((N, H), np.float32)

        self.h_list.append(h.copy())
        self.c_list.append(c.copy())

        hs = np.zeros((N, T, H), np.float32)

        for t in range(T):
            x_t = X[:, t, :]
            self.x_list.append(x_t)

            a = x_t @ self.Wx + h @ self.Wh + self.b  # (N,4H)
            i = 1.0 / (1.0 + np.exp(-a[:, :H]))
            f = 1.0 / (1.0 + np.exp(-a[:, H:2*H]))
            o = 1.0 / (1.0 + np.exp(-a[:, 2*H:3*H]))
            g = np.tanh(a[:, 3*H:])

            c = f * c + i * g
            h = o * np.tanh(c)

            self.i_list.append(i)
            self.f_list.append(f)
            self.o_list.append(o)
            self.g_list.append(g)
            self.c_list.append(c.copy())
            self.h_list.append(h.copy())

            hs[:, t, :] = h

        return hs

    def backward(self, dhs):
        """
        dhs: (N, T, H)  ← 모든 시점의 hidden에 대한 gradient
        return dx: (N, T, D)
        """
        N, T, H = dhs.shape
        D = self.D

        dWx = np.zeros_like(self.Wx)
        dWh = np.zeros_like(self.Wh)
        db  = np.zeros_like(self.b)
        dx  = np.zeros((N, T, D), np.float32)

        dh_next = np.zeros((N, H), np.float32)
        dc_next = np.zeros((N, H), np.float32)

        for t in reversed(range(T)):
            # 이번 타임스텝에서 내려온 grad + 다음 스텝에서 온 grad
            dh = dhs[:, t, :] + dh_next

            x_t = self.x_list[t]
            i = self.i_list[t]
            f = self.f_list[t]
            o = self.o_list[t]
            g = self.g_list[t]
            c_t = self.c_list[t+1]
            c_prev = self.c_list[t]
            h_prev = self.h_list[t]

            tanh_c = np.tanh(c_t)

            do = dh * tanh_c
            dc = dh * o * (1.0 - tanh_c**2) + dc_next

            df = dc * c_prev
            di = dc * g
            dg = dc * i
            dc_prev = dc * f

            di_in = di * i * (1.0 - i)
            df_in = df * f * (1.0 - f)
            do_in = do * o * (1.0 - o)
            dg_in = dg * (1.0 - g**2)

            da = np.concatenate([di_in, df_in, do_in, dg_in], axis=1)  # (N,4H)

            dWx += x_t.T @ da
            dWh += h_prev.T @ da
            db  += da.sum(axis=0)

            dx[:, t, :] = da @ self.Wx.T
            dh_prev     = da @ self.Wh.T

            dh_next = dh_prev
            dc_next = dc_prev

        self.dWx, self.dWh, self.db = dWx, dWh, db
        return dx

# 2. BiLSTM (정방향 + 역방향)
class BiLSTM:
    def __init__(self, D, H, rng):
        self.fwd = LSTM(D, H, rng)
        self.bwd = LSTM(D, H, rng)
        self.H = H

    def forward(self, X):
        """
        X: (N,T,D)
        return hs_bi: (N,T,2H)
        """
        hs_f = self.fwd.forward(X)               # (N,T,H)
        hs_b = self.bwd.forward(X[:, ::-1, :])   # (N,T,H)
        hs_b = hs_b[:, ::-1, :]                  # 다시 원래 순서로
        return np.concatenate([hs_f, hs_b], axis=2)  # (N,T,2H)

    def backward(self, dhs_bi):
        """
        dhs_bi: (N,T,2H)
        return dx: (N,T,D)
        """
        N, T, HH = dhs_bi.shape
        H = HH // 2

        dhs_f = dhs_bi[:, :, :H]
        dhs_b = dhs_bi[:, :, H:]

        # 역방향 LSTM은 시퀀스 뒤집어서 전달
        dhs_b_rev = dhs_b[:, ::-1, :]
        dx_f = self.fwd.backward(dhs_f)          # (N,T,D)
        dx_b_rev = self.bwd.backward(dhs_b_rev)  # (N,T,D)
        dx_b = dx_b_rev[:, ::-1, :]

        return dx_f + dx_b



# 3. CNN + BiLSTM + Attention 모델
class CNN_BiLSTM_Att_Stable:
    def __init__(self, T, D, H=96, num_classes=12, rng=None):
        self.rng = set_seed(42) if rng is None else rng

        # ---- CNN 부분 (얕게 1개 + 풀링) ----
        self.conv1 = Conv1D(k=5, c_in=D, c_out=64, rng=self.rng)
        self.relu1 = ReLU()
        self.pool1 = MaxPool1D(pool=2, stride=2)   # 100 -> 96 -> 48

        # ---- BiLSTM ----
        self.bilstm = BiLSTM(D=64, H=H, rng=self.rng)  # 출력: (N,T', 2H)

        # ---- Attention (time-wise) ----
        self.att_w = self.rng.uniform(-0.1, 0.1, (2*H, 1)).astype(np.float32)
        self.m_att = np.zeros_like(self.att_w)
        self.v_att = np.zeros_like(self.att_w)
        self.d_att_w = np.zeros_like(self.att_w)

        # ---- FC ----
        self.fc1 = Dense(in_dim=2*H, out_dim=64, rng=self.rng)
        self.relu_fc = ReLU()
        self.fc2 = Dense(in_dim=64, out_dim=num_classes, rng=self.rng)

        # Adam 하이퍼파라미터
        self.t = 0
        self.lr = 5e-4
        self.b1 = 0.9
        self.b2 = 0.999
        self.eps = 1e-8
        self.clip = 3.0

    # ----- Adam helper -----
    def _adam(self, param, grad, m, v):
        self.t += 1
        m[:] = self.b1 * m + (1 - self.b1) * grad
        v[:] = self.b2 * v + (1 - self.b2) * (grad * grad)
        mhat = m / (1 - self.b1**self.t)
        vhat = v / (1 - self.b2**self.t)
        param -= self.lr * mhat / (np.sqrt(vhat) + self.eps)

    # ----- Forward -----
    def forward(self, X):
        """
        X: (N,T,D)
        return logits: (N, num_classes)
        """
        # CNN
        z = self.conv1.forward(X)
        z = self.relu1.forward(z)
        z = self.pool1.forward(z)        # (N, T', 64)

        # BiLSTM
        hs = self.bilstm.forward(z)      # (N, T', 2H)
        self.hs = hs

        # Attention
        # score: (N, T', 1)
        score = hs @ self.att_w          # (N,T',1)
        score = score - score.max(axis=1, keepdims=True)
        exp_s = np.exp(score)
        att = exp_s / (exp_s.sum(axis=1, keepdims=True) + 1e-12)
        self.att = att                   # (N,T',1)

        # context: (N, 2H)
        h_att = (hs * att).sum(axis=1)   # (N,2H)
        self.h_att = h_att

        # FC
        h1 = self.fc1.forward(h_att)     # (N,64)
        h1 = self.relu_fc.forward(h1)
        self.h1 = h1

        logits = self.fc2.forward(h1)    # (N,12)
        return logits

    # ----- Backward -----
    def backward(self, dlogits):
        """
        dlogits: (N, num_classes)
        """
        # FC2
        dh1 = self.fc2.backward(dlogits)     # (N,64)
        dh1 = self.relu_fc.backward(dh1)     # (N,64)
        dh_att = self.fc1.backward(dh1)      # (N,2H)

        # ----- Attention backward -----
        hs = self.hs          # (N,T,2H)
        att = self.att        # (N,T,1)
        N, T, HH = hs.shape

        # context = sum_t att_t * hs_t
        # → dhs_from_context
        dh_att_exp = dh_att[:, None, :]      # (N,1,2H)
        dhs_ctx = att * dh_att_exp           # (N,T,2H)

        # d alpha
        # alpha_t = att_t
        # dL/dalpha_t = dh_att · hs_t
        dalpha = np.sum(dh_att_exp * hs, axis=2, keepdims=True)   # (N,T,1)

        # softmax backward: ds = alpha * (dalpha - sum(dalpha*alpha))
        sum_dalpha_alpha = np.sum(dalpha * att, axis=1, keepdims=True)  # (N,1,1)
        ds = att * (dalpha - sum_dalpha_alpha)                          # (N,T,1)

        # d w_att
        # score_t = hs_t @ w_att → dL/dw = sum_{n,t} hs[n,t]^T * ds[n,t]
        d_att_w = np.zeros_like(self.att_w)     # (2H,1)
        # hs: (N,T,2H), ds: (N,T,1)
        d_att_w[:, 0] = np.sum(hs * ds, axis=(0,1))

        # d hs from score: ds @ w_att^T
        dhs_score = ds @ self.att_w.T           # (N,T,2H)

        dhs_total = dhs_ctx + dhs_score         # (N,T,2H)
        self.d_att_w = d_att_w

        # ----- BiLSTM backward -----
        dz = self.bilstm.backward(dhs_total)    # (N,T',64)

        # ----- CNN backward -----
        dz = self.pool1.backward(dz)            # (N, 96, 64)
        dz = self.relu1.backward(dz)
        dz = self.conv1.backward(dz)            # conv1.dW, conv1.db 계산됨

    # ----- Step (Adam + clipping) -----
    def step(self):
        params = [
            (self.conv1.W, self.conv1.dW, self.conv1.mW, self.conv1.vW),
            (self.conv1.b, self.conv1.db, self.conv1.mb, self.conv1.vb),

            (self.bilstm.fwd.Wx, self.bilstm.fwd.dWx, self.bilstm.fwd.mWx, self.bilstm.fwd.vWx),
            (self.bilstm.fwd.Wh, self.bilstm.fwd.dWh, self.bilstm.fwd.mWh, self.bilstm.fwd.vWh),
            (self.bilstm.fwd.b , self.bilstm.fwd.db , self.bilstm.fwd.mb , self.bilstm.fwd.vb),

            (self.bilstm.bwd.Wx, self.bilstm.bwd.dWx, self.bilstm.bwd.mWx, self.bilstm.bwd.vWx),
            (self.bilstm.bwd.Wh, self.bilstm.bwd.dWh, self.bilstm.bwd.mWh, self.bilstm.bwd.vWh),
            (self.bilstm.bwd.b , self.bilstm.bwd.db , self.bilstm.bwd.mb , self.bilstm.bwd.vb),

            (self.att_w, self.d_att_w, self.m_att, self.v_att),

            (self.fc1.W, self.fc1.dW, self.fc1.mW, self.fc1.vW),
            (self.fc1.b, self.fc1.db, self.fc1.mb, self.fc1.vb),
            (self.fc2.W, self.fc2.dW, self.fc2.mW, self.fc2.vW),
            (self.fc2.b, self.fc2.db, self.fc2.mb, self.fc2.vb)
        ]

        # gradient clipping
        for _, g, _, _ in params:
            np.clip(g, -self.clip, self.clip, out=g)

        # Adam 업데이트
        for p, g, m, v in params:
            self._adam(p, g, m, v)

    def predict(self, X):
        logits = self.forward(X)
        probs = softmax(logits)
        return np.argmax(probs, axis=1) + 1


# 4. Cross-validation 함수

def run_cnn_bilstm_att_crossval(
    X_all, y_all, s_all, pairs,
    epochs=8, batch=32, lr=5e-4
):
    f1s = []
    for i, pair in enumerate(pairs, 1):
        print(f"\n=== Fold {i} | Test subjects={pair} ===")

        X_tr, y_tr, X_te, y_te = split_by_subject(X_all, y_all, s_all, pair)
        X_tr, X_te, mean, std = standardize(X_tr, X_te)

        model = CNN_BiLSTM_Att_Stable(
            T=X_tr.shape[1],
            D=X_tr.shape[2],
            H=96,
            num_classes=12
        )
        model.lr = lr

        N = len(X_tr)
        idx = np.arange(N)

        for ep in range(1, epochs+1):
            np.random.shuffle(idx)
            losses = []

            for j in range(0, N, batch):
                b = idx[j:j+batch]
                xb = X_tr[b]
                yb = y_tr[b]
                yb_oh = one_hot(yb, 12)

                logits = model.forward(xb)
                probs = softmax(logits)
                loss = cross_entropy(probs, yb_oh)
                losses.append(loss)

                dlogits = (probs - yb_oh) / len(b)
                model.backward(dlogits)
                model.step()

            # 에폭마다 테스트 F1 출력
            pred = model.predict(X_te)
            f1 = macro_f1(y_te, pred, 12)
            print(f"[Epoch {ep}] loss={np.mean(losses):.4f} | F1={f1:.4f}")

        # 폴드 최종 평가
        pred = model.predict(X_te)
        f1 = macro_f1(y_te, pred, 12)
        acc = np.mean(pred == y_te)
        print(f"Fold {i} done. Acc={acc:.4f} | F1={f1:.4f}")
        f1s.append(f1)

    print("\n==== Summary ====")
    print("Per-fold F1:", [f"{v:.4f}" for v in f1s])
    print("Mean F1:", np.mean(f1s))
    return f1s, float(np.mean(f1s))

# 5. 실행

f1s_best, mean_f1_best = run_cnn_bilstm_att_crossval(
    X_all, y_all, s_all, pairs,
    epochs=8, batch=32, lr=5e-4
)

print("\n최종 평균 Macro-F1 (CNN+BiLSTM+Att, 안정형):", mean_f1_best)



=== Fold 1 | Test subjects=(1, 2) ===
[Epoch 1] loss=0.4147 | F1=0.7291
[Epoch 2] loss=0.0176 | F1=0.7853
[Epoch 3] loss=0.0092 | F1=0.7223
[Epoch 4] loss=0.0039 | F1=0.8132
[Epoch 5] loss=0.0029 | F1=0.8042
[Epoch 6] loss=0.0010 | F1=0.8046
[Epoch 7] loss=0.0005 | F1=0.8048
[Epoch 8] loss=0.0004 | F1=0.8091
Fold 1 done. Acc=0.8249 | F1=0.8091

=== Fold 2 | Test subjects=(3, 4) ===
[Epoch 1] loss=0.4489 | F1=0.8928
[Epoch 2] loss=0.0276 | F1=0.9106
[Epoch 3] loss=0.0160 | F1=0.9030
[Epoch 4] loss=0.0080 | F1=0.9136
[Epoch 5] loss=0.0039 | F1=0.9054
[Epoch 6] loss=0.0012 | F1=0.9057
[Epoch 7] loss=0.0006 | F1=0.9092
[Epoch 8] loss=0.0004 | F1=0.9090
Fold 2 done. Acc=0.9079 | F1=0.9090

=== Fold 3 | Test subjects=(5, 6) ===
[Epoch 1] loss=0.4311 | F1=0.8164
[Epoch 2] loss=0.0388 | F1=0.8944
[Epoch 3] loss=0.0193 | F1=0.8830
[Epoch 4] loss=0.0067 | F1=0.8928
[Epoch 5] loss=0.0072 | F1=0.8943
[Epoch 6] loss=0.0065 | F1=0.8954
[Epoch 7] loss=0.0026 | F1=0.8981
[Epoch 8] loss=0.0008 | F1=0.