In [7]:
# -*- coding: utf-8 -*-
"""
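IVDD binary classification (2 classes: ivdd / normal) from DeepLabCut CSV files (3-level header)
- 5 keypoints × (x, y) = 10 dims (low-likelihood points are masked and interpolated only when USE_LIKELIHOOD is True; it is False by default)
- Windowing: SEQ_LEN=60, STRIDE=30
- Model: TimeDistributed(Dense->ReLU) -> LSTM -> LSTM -> Dense(2 logits)
- Learning rate starts at LR and is lowered by ReduceLROnPlateau (no EarlyStopping)
- Output locations follow the new project directory layout:
    data/train/train_csv           ... training CSVs (input)
    data/train/fig                 ... learning curves (one image per run)
    data/train/train1_model        ... trained models (best/final)
    data/train/val_misclassified   ... CSV listing misclassified validation windows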
"""

import os, re, glob, math, json
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from pathlib import Path


def detect_project_root() -> Path:
    """Locate the project root directory.

    Candidates are tried in this order and the first hit wins:

    1. The ``IVDD_PROJECT_ROOT`` environment variable, if it points at a
       directory containing ``data`` or ``scripts``.
    2. The location of this file (only when ``__file__`` exists, i.e. not in
       a notebook): the parent of a ``scripts`` directory, or the file's own
       directory, provided it contains ``data`` or ``scripts``.
    3. The current working directory or the nearest ancestor that contains
       both ``scripts`` and ``data``.
    4. Fallback: the parent of the CWD when the CWD is named ``scripts``,
       otherwise the CWD itself.

    Returns:
        The detected project root as a ``Path``.
    """

    def has_any_marker(folder: Path) -> bool:
        # True when at least one of the marker directories exists.
        return any((folder / name).is_dir() for name in ("data", "scripts"))

    def has_both_markers(folder: Path) -> bool:
        # True only when both marker directories exist.
        return all((folder / name).is_dir() for name in ("scripts", "data"))

    # 1) An explicit override takes priority, but only if it looks valid.
    override = os.environ.get("IVDD_PROJECT_ROOT")
    if override:
        root = Path(override).expanduser().resolve()
        if has_any_marker(root):
            return root

    # 2) Derive the root from this file's location when run as a script.
    try:
        script_dir = Path(__file__).resolve().parent
    except NameError:
        # Notebooks do not define __file__.
        script_dir = None
    if script_dir is not None:
        if script_dir.name == "scripts" and has_any_marker(script_dir.parent):
            return script_dir.parent
        if has_any_marker(script_dir):
            return script_dir

    # 3) Walk upwards from the CWD looking for both marker directories.
    cwd = Path.cwd()
    for folder in (cwd, *cwd.parents):
        if has_both_markers(folder):
            return folder

    # 4) Last resort.
    if cwd.name == "scripts":
        return cwd.parent
    return cwd

PROJ_ROOT = detect_project_root()
print(f"[INFO] Project root = {PROJ_ROOT}")

# Every fixed path below is built from PROJ_ROOT.
TRAIN_DIR     = PROJ_ROOT / "data" / "train"
TRAIN_CSV_DIR = TRAIN_DIR / "train_csv"
FIG_DIR       = TRAIN_DIR / "fig"
MODEL_DIR     = TRAIN_DIR / "train1_model"
VALERR_DIR    = TRAIN_DIR / "val_misclassified"

# glob.glob works on strings, so the Path pattern is converted explicitly.
CSV_GLOB = str(TRAIN_CSV_DIR / "*.csv")
# Only the output directories are created; the input CSV directory is not.
for d in [FIG_DIR, MODEL_DIR, VALERR_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# ========= Settings =========
# Body-part names to extract; matching ignores case, spaces, "_" and "-".
KEYPOINTS = [
    "left back paw",
    "right back paw",
    "left front paw",
    "right front paw",
    "tail set",
]

# When False, read_dlc_5kp_xy skips likelihood-based masking entirely and
# MIN_KEEP_LIKELIHOOD has no effect.
USE_LIKELIHOOD      = False
MIN_KEEP_LIKELIHOOD = 0.6

SEQ_LEN   = 60
STRIDE    = 30
# One (x, y) pair per keypoint; derived so it cannot drift from KEYPOINTS.
DIMS      = 2 * len(KEYPOINTS)
BATCH_SIZE = 30
EPOCHS     = 100

# Initial learning rate handed to the Adam optimizer.
LR = 1e-4
# True: split train/validation by source file; False: split by window.
VAL_SPLIT_BY_FILE = True

CLASS_NAMES  = ["ivdd", "normal"]
CLASS_TO_IDX = {c:i for i,c in enumerate(CLASS_NAMES)}
# Derived so it always matches CLASS_NAMES.
N_CLASSES    = len(CLASS_NAMES)

tf.random.set_seed(42)
np.random.seed(42)

# Unique per-run ID (timestamp) used in output file names.
DATE_STR = datetime.now().strftime("%Y%m%d-%H%M%S")


# ========= ユーティリティ =========
def _norm_name(s: str) -> str:
    """Return *s* lower-cased with every space, underscore and hyphen removed."""
    separators = str.maketrans("", "", " _-")
    return s.lower().translate(separators)

def _resolve_keypoints(all_bodyparts, requested):
    """Map requested keypoint names onto the body-part names present in a CSV.

    Names are compared after normalisation with ``_norm_name``. If several
    body parts normalise to the same key, the first one in *all_bodyparts*
    is used.

    Args:
        all_bodyparts: Body-part names available in the CSV.
        requested: Keypoint names to look up.

    Returns:
        The matching original body-part names, in the order of *requested*.

    Raises:
        ValueError: If any requested keypoint has no match.
    """
    lookup = {}
    for bodypart in all_bodyparts:
        # setdefault keeps the first body part seen for each normalised key.
        lookup.setdefault(_norm_name(bodypart), bodypart)

    resolved = []
    missing = []
    for wanted in requested:
        key = _norm_name(wanted)
        if key not in lookup:
            missing.append(wanted)
            continue
        resolved.append(lookup[key])

    if missing:
        raise ValueError(f"指定キーポイントがCSVで見つかりません: {missing}\n利用可能: {all_bodyparts}")
    return resolved

def infer_label_from_filename(filename: str) -> int:
    """Derive the class index from a CSV file name.

    The lower-cased base name is searched for ``ivdd`` and ``normal`` as
    stand-alone words (not adjacent to another letter). If exactly one of
    them is found, that class wins. Otherwise the text before the first
    ``_``, ``-`` or ``.`` is tried as a class name.

    Args:
        filename: Path or file name of the CSV.

    Returns:
        The index of the class in ``CLASS_TO_IDX``.

    Raises:
        ValueError: If no class can be determined.
    """
    base = os.path.basename(filename).lower()
    has_ivdd = re.search(r'(?<![a-z])ivdd(?![a-z])', base, flags=re.I) is not None
    has_normal = re.search(r'(?<![a-z])normal(?![a-z])', base, flags=re.I) is not None

    # Exactly one of the two words present: unambiguous.
    if has_ivdd != has_normal:
        return CLASS_TO_IDX["ivdd" if has_ivdd else "normal"]

    # Both or neither present: fall back to the leading token.
    token = re.split(r'[_\-.]', base)[0]
    try:
        return CLASS_TO_IDX[token]
    except KeyError:
        pass
    raise ValueError(f"ラベルを特定できません: {filename}（'ivdd' または 'normal' を単語として含めてください）")

def read_dlc_5kp_xy(csv_path: str,
                    keypoints,
                    use_likelihood=True,
                    min_keep_likelihood=0.6):
    """Load the (x, y) tracks of the requested keypoints from a DeepLabCut CSV.

    The CSV is read with a 3-level column header and its first column as the
    index. For every resolved keypoint the ``x`` and ``y`` columns are
    extracted, in the order given by *keypoints*. Missing values are filled
    by linear interpolation, then backward/forward fill, then 0.0.

    Args:
        csv_path: Path of the CSV file.
        keypoints: Requested keypoint names (resolved with
            ``_resolve_keypoints``).
        use_likelihood: When True, coordinates whose ``likelihood`` is below
            *min_keep_likelihood* are set to NaN before gap filling.
        min_keep_likelihood: Likelihood threshold used for masking.

    Returns:
        ``(X, use_kps)`` where ``X`` is a float32 array with one row per CSV
        row and two columns (x, y) per keypoint, and ``use_kps`` is the list
        of body-part names as spelled in the CSV.

    Raises:
        ValueError: If a requested keypoint is not present in the CSV.
    """
    # 3-level header
    df = pd.read_csv(csv_path, header=[0, 1, 2], index_col=0)

    # dict.fromkeys de-duplicates while keeping CSV column order, so name
    # resolution is deterministic (a set has no stable order).
    bodyparts = list(dict.fromkeys(bp for (_, bp, _) in df.columns))
    use_kps = _resolve_keypoints(bodyparts, keypoints)

    # Extract (x, y) for each keypoint.
    names, frames = [], []
    for bp in use_kps:
        for coord in ("x", "y"):
            names.append(f"{bp}_{coord}")
            frames.append(df.xs((bp, coord), level=[1, 2], axis=1))
    X_df = pd.concat(frames, axis=1)
    X_df.columns = names
    # Float storage is required so that masked entries can hold NaN.
    X_df = X_df.astype(np.float64)

    if use_likelihood:
        for bp in use_kps:
            try:
                lik = df.xs((bp, "likelihood"), level=[1, 2], axis=1)
            except KeyError:
                # No likelihood column for this keypoint: leave it unmasked.
                continue
            low = lik.to_numpy().ravel() < min_keep_likelihood
            # Assign through .loc instead of mutating ``.values``: that array
            # can be a read-only view when pandas Copy-on-Write is active.
            X_df.loc[low, [f"{bp}_x", f"{bp}_y"]] = np.nan

    X_df = X_df.interpolate(method="linear", limit_direction="both", axis=0)
    X_df = X_df.bfill().ffill().fillna(0.0)
    X = X_df.to_numpy(dtype=np.float32)
    return X, use_kps

def zscore_per_file(X: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """Standardise each column of *X* using statistics computed along axis 0.

    Args:
        X: Array whose first axis is reduced to obtain per-column mean/std.
        eps: Added to the standard deviation to avoid division by zero.

    Returns:
        ``(X - mean) / (std + eps)`` with the same shape as *X*.
    """
    column_mean = np.mean(X, axis=0, keepdims=True)
    column_std = np.std(X, axis=0, keepdims=True)
    centered = X - column_mean
    scale = column_std + eps
    return centered / scale

def make_windows(X: np.ndarray, seq_len: int, stride: int):
    """Cut *X* into fixed-length windows along its first axis.

    Args:
        X: Input array; windows are taken over axis 0.
        seq_len: Number of rows per window.
        stride: Step between consecutive window starts.

    Returns:
        ``(Xw, starts)`` where ``Xw`` stacks the windows on a new leading
        axis and ``starts`` lists the first row index of each window. When
        *X* has fewer than *seq_len* rows, an empty
        ``(0, seq_len, X.shape[1])`` array and an empty list are returned.
    """
    total_rows = X.shape[0]
    if total_rows >= seq_len:
        starts = list(range(0, total_rows - seq_len + 1, stride))
        windows = [X[begin:begin + seq_len] for begin in starts]
        return np.stack(windows, axis=0), starts

    # Too short for even one window.
    empty = np.empty((0, seq_len, X.shape[1]), dtype=X.dtype)
    return empty, []

def build_dataset(csv_paths, seq_len=SEQ_LEN, stride=STRIDE):
    """Build the windowed dataset from a list of DeepLabCut CSV files.

    For each file the label is inferred from its name, the keypoint tracks
    are loaded, z-scored per file and cut into windows. Files too short to
    yield a single window are skipped with a warning.

    Args:
        csv_paths: Paths of the CSV files to load.
        seq_len: Window length in frames.
        stride: Step between consecutive windows.

    Returns:
        ``(X, y, file_ids, starts_arr)``: the stacked windows, one int64
        label per window, the source file base name per window, and the
        start frame of each window within its file.

    Raises:
        ValueError: If a file does not yield ``DIMS`` feature columns.
        RuntimeError: If no file produced any window.
    """
    window_chunks = []
    label_chunks = []
    file_ids = []
    starts_list = []
    used_kps_any = None

    for p in csv_paths:
        # Label first, so badly named files fail before any CSV parsing.
        y_lab = infer_label_from_filename(p)
        X_raw, used_kps = read_dlc_5kp_xy(
            p,
            keypoints=KEYPOINTS,
            use_likelihood=USE_LIKELIHOOD,
            min_keep_likelihood=MIN_KEEP_LIKELIHOOD,
        )
        if X_raw.shape[1] != DIMS:
            raise ValueError(f"{os.path.basename(p)}: 取り出し次元 {X_raw.shape[1]} != 期待 {DIMS}")

        X_win, starts = make_windows(zscore_per_file(X_raw), seq_len, stride)
        n_windows = X_win.shape[0]
        if n_windows == 0:
            print(f"[WARN] {os.path.basename(p)}: フレーム不足（{seq_len}未満）でスキップ")
            continue

        window_chunks.append(X_win)
        label_chunks.append(np.full((n_windows,), y_lab, dtype=np.int64))
        file_ids.extend([os.path.basename(p)] * n_windows)
        starts_list.extend(starts)
        used_kps_any = used_kps

    if not window_chunks:
        raise RuntimeError("データが作れませんでした。CSV と命名規則（ivdd/normal）を確認してください。")

    X = np.concatenate(window_chunks, axis=0)
    y = np.concatenate(label_chunks, axis=0)
    print(f"[INFO] 使用キーポイント実名: {used_kps_any}")
    return X, y, np.array(file_ids), np.array(starts_list)

# ========= モデル =========
class LSTM_RNN(keras.Model):
    """Sequence classifier: TimeDistributed(Dense+ReLU) -> LSTM -> LSTM -> Dense.

    The final Dense layer has no activation, so the model outputs raw logits.
    """

    def __init__(self, n_input, n_hidden, n_classes):
        """Create the layers.

        Args:
            n_input: Number of input features per timestep. Not used when
                building the layers; kept as part of the constructor
                signature.
            n_hidden: Unit count shared by the Dense projection and both
                LSTM layers.
            n_classes: Number of output logits.
        """
        super().__init__()
        # Dense+ReLU projection, wrapped in TimeDistributed below.
        self.input_dense = keras.layers.Dense(n_hidden, activation='relu')
        self.time_dist   = keras.layers.TimeDistributed(self.input_dense)
        # The first LSTM returns the full sequence so the second can consume it.
        self.lstm1 = keras.layers.LSTM(n_hidden, return_sequences=True)
        self.lstm2 = keras.layers.LSTM(n_hidden)
        self.out   = keras.layers.Dense(n_classes)  # logits

    def call(self, x, training=False):
        """Run the forward pass and return logits.

        Args:
            x: Input batch; presumably (batch, time, features) as produced by
                make_windows — TODO confirm against callers.
            training: Forwarded to both LSTM layers.

        Returns:
            The output of the final Dense layer (logits).
        """
        x = self.time_dist(x)
        x = self.lstm1(x, training=training)
        x = self.lstm2(x, training=training)
        x = self.out(x)
        return x

class LSTMWithL2(LSTM_RNN):
    """LSTM_RNN with custom train/test steps that add an L2 weight penalty.

    The loss is categorical cross-entropy on logits (targets are expected to
    be one-hot) plus ``l2_lambda`` times the summed ``tf.nn.l2_loss`` of all
    trainable variables.
    """

    def __init__(self, n_input, n_hidden, n_classes, l2_lambda=1e-4):
        """Create the layers, the loss function and the metric trackers.

        Args:
            n_input: Forwarded to LSTM_RNN.
            n_hidden: Forwarded to LSTM_RNN.
            n_classes: Forwarded to LSTM_RNN.
            l2_lambda: Coefficient of the L2 penalty.
        """
        super().__init__(n_input, n_hidden, n_classes)
        self.l2_lambda   = l2_lambda
        self.loss_fn     = keras.losses.CategoricalCrossentropy(from_logits=True)
        self.metric_acc  = keras.metrics.CategoricalAccuracy(name="accuracy")
        self.metric_loss = keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        """Return the loss and accuracy trackers updated by the custom steps."""
        # NOTE(review): presumably exposed so Keras resets them between
        # epochs and between train/validation — confirm for the Keras
        # version in use.
        return [self.metric_loss, self.metric_acc]

    def compile(self, optimizer, **kwargs):
        """Compile with only an optimizer; the loss is handled in the steps."""
        super().compile(optimizer=optimizer, **kwargs)

    def _l2(self):
        """Return the L2 penalty over all trainable variables (0.0 if none)."""
        if not self.trainable_variables:
            return 0.0
        return self.l2_lambda * tf.add_n([tf.nn.l2_loss(v) for v in self.trainable_variables])

    def train_step(self, data):
        """Run one optimisation step.

        Args:
            data: Either ``(x, y)`` or ``(x, y, sample_weight)``.

        Returns:
            Dict with the running ``loss`` and ``accuracy`` values.
        """
        if isinstance(data, (list, tuple)) and len(data) == 3:
            x, y, sample_weight = data
        else:
            x, y = data
            sample_weight = None
        with tf.GradientTape() as tape:
            logits = self(x, training=True)
            # Total loss = weighted cross-entropy + L2 penalty.
            loss = self.loss_fn(y, logits, sample_weight=sample_weight) + self._l2()
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        # The tracked loss includes the L2 penalty.
        self.metric_loss.update_state(loss)
        self.metric_acc.update_state(y, logits, sample_weight=sample_weight)
        return {"loss": self.metric_loss.result(), "accuracy": self.metric_acc.result()}

    def test_step(self, data):
        """Evaluate one batch without updating weights.

        Args:
            data: Either ``(x, y)`` or ``(x, y, sample_weight)``.

        Returns:
            Dict with the running ``loss`` and ``accuracy`` values.
        """
        if isinstance(data, (list, tuple)) and len(data) == 3:
            x, y, sample_weight = data
        else:
            x, y = data
            sample_weight = None
        logits = self(x, training=False)
        # Same loss definition as in training, L2 penalty included.
        loss = self.loss_fn(y, logits, sample_weight=sample_weight) + self._l2()
        self.metric_loss.update_state(loss)
        self.metric_acc.update_state(y, logits, sample_weight=sample_weight)
        return {"loss": self.metric_loss.result(), "accuracy": self.metric_acc.result()}

# ========= Data loading =========
csv_files = sorted(glob.glob(CSV_GLOB))
if not csv_files:
    raise FileNotFoundError(f"学習CSVが見つかりません: {CSV_GLOB}")

X, y, file_ids, starts = build_dataset(csv_files, SEQ_LEN, STRIDE)
print("X:", X.shape, "y:", y.shape, "unique files:", len(np.unique(file_ids)))

# One-hot targets, as required by the categorical loss used by the model.
y_oh = keras.utils.to_categorical(y, num_classes=N_CLASSES)

# ========= Train / validation split =========
if not VAL_SPLIT_BY_FILE:
    # Window-level split, stratified by class.
    X_train, X_val, y_train, y_val, idx_train, idx_val = train_test_split(
        X, y_oh, np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
    )
    val_file_names = file_ids[idx_val]
    val_starts     = starts[idx_val]
else:
    # File-level split: all windows of one file land on the same side.
    uniq_files = np.unique(file_ids)
    tr_files, va_files = train_test_split(
        uniq_files, test_size=0.2, random_state=42, shuffle=True
    )
    tr_mask = np.isin(file_ids, tr_files)
    va_mask = np.isin(file_ids, va_files)
    X_train = X[tr_mask]
    y_train = y_oh[tr_mask]
    X_val = X[va_mask]
    y_val = y_oh[va_mask]
    # Keep the source CSV name and window start frame of each validation row.
    val_file_names = file_ids[va_mask]
    val_starts     = starts[va_mask]

# ========= Class-imbalance handling =========
train_labels = np.argmax(y_train, axis=1)
cls_w = compute_class_weight(
    "balanced", classes=np.arange(N_CLASSES), y=train_labels
)
class_weight = {idx: float(weight) for idx, weight in enumerate(cls_w)}
print("class_weight:", class_weight)

# ========= Model and training =========
n_hidden = 30
model = LSTMWithL2(n_input=DIMS, n_hidden=n_hidden, n_classes=N_CLASSES, l2_lambda=1e-4)
opt = keras.optimizers.Adam(learning_rate=LR)
model.compile(optimizer=opt)

best_path  = os.path.join(MODEL_DIR, f"ivdd_lstm_{DATE_STR}_best.keras")
final_path = os.path.join(MODEL_DIR, f"ivdd_lstm_{DATE_STR}_final.keras")

# Save the weights with the best validation accuracy seen so far.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    best_path,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)
# Halve the learning rate when validation loss stops improving.
reduce_lr_cb = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1,
)
callbacks = [checkpoint_cb, reduce_lr_cb]

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight,
    verbose=1,
    callbacks=callbacks,
)

# ========= Validation evaluation & misclassified-window CSV =========
# The model's last layer emits raw logits, so they are passed through softmax
# before being reported as probabilities (the p_* columns below).
y_val_logits = model.predict(X_val, batch_size=BATCH_SIZE)
y_val_prob = tf.nn.softmax(y_val_logits, axis=-1).numpy()
# argmax on the logits: softmax is monotonic, so predictions are unchanged.
y_val_pred = np.argmax(y_val_logits, axis=1)
y_val_true = np.argmax(y_val, axis=1)

# Passing the label ids explicitly keeps the report and the matrix aligned
# with CLASS_NAMES even when a class is absent from the validation split.
label_ids = list(range(len(CLASS_NAMES)))

print("\n[Validation] classification_report:")
print(classification_report(
    y_val_true, y_val_pred,
    labels=label_ids, target_names=CLASS_NAMES,
    digits=4, zero_division=0,
))
cm = confusion_matrix(y_val_true, y_val_pred, labels=label_ids)
print("[Validation] confusion_matrix:\n", cm)

# Keep only the misclassified rows (one row per window).
df_val = pd.DataFrame({
    "file": val_file_names,
    "start": val_starts,
    "true_idx": y_val_true,
    "pred_idx": y_val_pred,
    "true": [CLASS_NAMES[i] for i in y_val_true],
    "pred": [CLASS_NAMES[i] for i in y_val_pred],
    "p_ivdd": y_val_prob[:, CLASS_TO_IDX["ivdd"]],
    "p_normal": y_val_prob[:, CLASS_TO_IDX["normal"]],
})
df_mis = df_val[df_val["true_idx"] != df_val["pred_idx"]].copy()
mis_csv_path = os.path.join(VALERR_DIR, f"val_misclassified_{DATE_STR}.csv")
df_mis.to_csv(mis_csv_path, index=False, encoding="utf-8-sig")
print(f"[INFO] 誤分類ウィンドウ一覧を保存: {mis_csv_path} (rows={len(df_mis)})")

# ========= Learning curves (loss & accuracy in one figure) =========
fig_path = os.path.join(FIG_DIR, f"curve_{DATE_STR}.png")
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))

ax_loss.plot(history.history["loss"], label="train")
ax_loss.plot(history.history["val_loss"], label="val")
ax_loss.set_title("Loss")
ax_loss.legend()

ax_acc.plot(history.history["accuracy"], label="train")
ax_acc.plot(history.history["val_accuracy"], label="val")
ax_acc.set_title("Accuracy")
ax_acc.legend()

fig.tight_layout()
fig.savefig(fig_path, dpi=150)
plt.close(fig)
print(f"[INFO] 学習曲線を保存: {fig_path}")

# ========= Save the final model =========
model.save(final_path)
print(f"[INFO] Final model saved to: {final_path}")
print(f"[INFO] Best model saved to:  {best_path}")


[INFO] Project root = c:\kanno\vscode\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master
[INFO] 使用キーポイント実名: ['left back paw', 'right back paw', 'left front paw', 'right front paw', 'tail set']
X: (1460, 60, 10) y: (1460,) unique files: 198
class_weight: {0: 0.9166666666666666, 1: 1.1}
Epoch 1/100
[1m37/39[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 11ms/step - accuracy: 0.4892 - loss: 0.7131
Epoch 1: val_accuracy improved from -inf to 0.49320, saving model to c:\kanno\vscode\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\data\train\train1_model\ivdd_lstm_20251211-045534_best.keras
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.4899 - loss: 0.7127 - val_accuracy: 0.4932 - val_loss: 0.7041 - learning_rate: 1.0000e-04
Epoch 2/100
[1m38/39[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[