In [2]:
import os
import time
import copy
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import io
import contextlib

try:
    from fvcore.nn import FlopCountAnalysis
    FVCORE_AVAILABLE = True
except ImportError:
    FVCORE_AVAILABLE = False

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# ------------------------------------------------------------------------------
# 1. UniMiBARDataset
# ------------------------------------------------------------------------------
class UniMiBARDataset(Dataset):
    def __init__(self, data_path, split='train', train_ratio=0.8, seed=42):
        self.split = split

        # 1. 파일 로드 (파일명이 대소문자 구분이 있을 수 있으니 확인 필요)
        # 예: adl_data.npy, adl_labels.npy
        x_path = os.path.join(data_path, 'adl_data.npy')
        y_path = os.path.join(data_path, 'adl_labels.npy')

        # 데이터가 없으면 에러 발생
        if not os.path.exists(x_path):
            raise FileNotFoundError(f"File not found: {x_path}")

        raw_x = np.load(x_path)
        raw_y = np.load(y_path)

        # 2. 데이터 전처리
        # (N, 453) -> (N, 3, 151) -> (N, 151, 3)
        # UniMiB는 보통 [x...x, y...y, z...z] 순서
        if raw_x.ndim == 2:
            raw_x = raw_x.reshape(-1, 3, 151).transpose(0, 2, 1)

        # 라벨 처리: (N, 3) -> 첫번째 컬럼(Action ID)만 사용
        # 클래스가 1~9 이므로 0~8로 변환하기 위해 -1
        if raw_y.ndim > 1:
            raw_y = raw_y[:, 0]

        self.y_all = (raw_y - 1).astype(np.int64)
        self.X_all = raw_x.astype(np.float32)

        # 3. Train / Test 분할 (고정 시드 사용)
        total_len = len(self.X_all)
        indices = np.arange(total_len)

        np.random.seed(seed)
        np.random.shuffle(indices)

        split_idx = int(total_len * train_ratio)

        if split == 'train':
            self.indices = indices[:split_idx]
        else:
            self.indices = indices[split_idx:]

        print(f"[{split.upper()}] Loaded UniMiB-SHAR ADL: {len(self.indices)} samples (Shape: {self.X_all.shape[1:]})")

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        real_idx = self.indices[idx]
        return torch.FloatTensor(self.X_all[real_idx]), torch.LongTensor([self.y_all[real_idx]])[0]


# ------------------------------------------------------------------------------
# 2. ASF Model Components
# ------------------------------------------------------------------------------

class LatentEncoder(nn.Module):
    def __init__(self, input_channels=9, latent_dim=64):
        super().__init__()
        self.conv1 = nn.Conv1d(input_channels, 32, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(32)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(64)
        self.conv3 = nn.Conv1d(64, latent_dim, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(latent_dim)

    def forward(self, x):
        x = x.transpose(1, 2)
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        s = F.relu(self.bn3(self.conv3(h)))
        s = s.transpose(1, 2)
        return s

class StandardCNN(nn.Module):
    def __init__(self, input_channels=9, latent_dim=64, num_classes=6, hidden_dim=64):
        super().__init__()

        # 1. ASF-DCL과 동일한 Encoder
        self.latent_encoder = LatentEncoder(input_channels, latent_dim)

        # 2. Flow 모듈 없이 단순한 Classifier (Pooling 후 FC)
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        # [Batch, Time, Dim]
        s = self.latent_encoder(x)

        # Global Average Pooling (시간 축 평균)
        s_pool = torch.mean(s, dim=1)

        logits = self.classifier(s_pool)
        return logits

# ------------------------------------------------------------------------------
# 5. Train / Evaluation
# ------------------------------------------------------------------------------
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in dataloader:
        x = batch[0].to(device)
        y = batch[1].to(device)

        optimizer.zero_grad()
        logits = model(x)
        loss = F.cross_entropy(logits, y, label_smoothing=0.05)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.detach().cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return avg_loss, f1

def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            x = batch[0].to(device)
            y = batch[1].to(device)

            logits = model(x)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(y.detach().cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return acc, f1

def measure_efficiency(model, input_shape=(1, 128, 9), warmup=10, iters=100):
    """
    모델의 파라미터 수, FLOPs, 추론 속도를 측정합니다.
    CPU 환경에서 측정하여 하드웨어 의존성을 줄인 벤치마크를 수행합니다.
    """
    measure_device = torch.device('cpu')
    model_cpu = copy.deepcopy(model).to(measure_device)
    model_cpu.eval()

    # 더미 입력 데이터 생성 (Batch=1)
    real_input_shape = list(input_shape)
    real_input_shape[0] = 1
    sample_input = torch.randn(tuple(real_input_shape)).to(measure_device)

    # 1) 파라미터 수
    total_params = sum(p.numel() for p in model_cpu.parameters())
    params_m = total_params / 1e6  # million params

    # 2) FLOPs 측정 (fvcore 사용 가능할 때만)
    flops_m = None
    if FVCORE_AVAILABLE:
        try:
            with torch.no_grad():
                # 불필요한 출력 억제
                fake_out = io.StringIO()
                fake_err = io.StringIO()
                with contextlib.redirect_stdout(fake_out), contextlib.redirect_stderr(fake_err):
                    flops = FlopCountAnalysis(model_cpu, (sample_input,))
                    total_flops = flops.total()
                flops_m = total_flops / 1e6  # to millions
        except Exception as e:
            print(f"FLOPs calculation failed: {e}")
            flops_m = None

    # 3) 추론 시간 측정
    with torch.no_grad():
        # Warmup
        for _ in range(warmup):
            _ = model_cpu(sample_input)

        start = time.time()
        for _ in range(iters):
            _ = model_cpu(sample_input)
        end = time.time()

    avg_sec = (end - start) / iters
    inference_ms = avg_sec * 1000.0

    del model_cpu

    return {
        "params_m": params_m,
        "flops_m": flops_m,
        "inference_ms": inference_ms,
    }

# ------------------------------------------------------------------------------
# 6. Main Training Loop
# ------------------------------------------------------------------------------
def main():
    SEED = 42
    set_seed(SEED)

    DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/HAR_data/UniMiB-SHAR'
    BATCH_SIZE = 64
    NUM_EPOCHS = 50
    LEARNING_RATE = 0.001
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_dataset = UniMiBARDataset(DATA_PATH, split='train', train_ratio=0.8, seed=SEED)
    test_dataset = UniMiBARDataset(DATA_PATH, split='test', train_ratio=0.8, seed=SEED)

    g = torch.Generator()
    g.manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=2,
                              worker_init_fn=seed_worker,
                              generator=g)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, num_workers=2,
                             worker_init_fn=seed_worker,
                             generator=g)

    model = StandardCNN(
        input_channels=3,
        latent_dim=64,
        num_classes=9,
        hidden_dim=64
    ).to(DEVICE)

    total_params = sum(p.numel() for p in model.parameters())
    print()
    print(f"Total parameters: {total_params:,}")

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=LEARNING_RATE,
                                 weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=NUM_EPOCHS
    )

    best_acc = 0.0
    best_f1 = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    # --- 학습 루프 ---
    print("\nStarting Training (Standard CNN)...")
    for epoch in range(NUM_EPOCHS):
        t_loss, t_f1 = train_epoch(model, train_loader, optimizer, DEVICE)

        v_acc, v_f1 = evaluate(model, test_loader, DEVICE)

        if v_f1 > best_f1:
            best_f1 = v_f1
            best_model_wts = copy.deepcopy(model.state_dict())

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] Train F1: {t_f1:.4f} | Test F1: {v_f1:.4f} (Best: {best_f1:.4f})")

    metrics = measure_efficiency(model, input_shape=(1, 151, 3), warmup=10, iters=100)
    print(f"Inference Time   : {metrics['inference_ms']:.4f} ms / sample")

if __name__ == "__main__":
    main()

[TRAIN] Loaded UniMiB-SHAR ADL: 6063 samples (Shape: (151, 3))
[TEST] Loaded UniMiB-SHAR ADL: 1516 samples (Shape: (151, 3))

Total parameters: 28,233

Starting Training (Standard CNN)...
Epoch [10/50] Train F1: 0.7120 | Test F1: 0.7559 (Best: 0.7791)
Epoch [20/50] Train F1: 0.8026 | Test F1: 0.8660 (Best: 0.8751)
Epoch [30/50] Train F1: 0.8378 | Test F1: 0.9075 (Best: 0.9075)
Epoch [40/50] Train F1: 0.8901 | Test F1: 0.9150 (Best: 0.9150)
Epoch [50/50] Train F1: 0.9083 | Test F1: 0.9416 (Best: 0.9416)
Inference Time   : 0.4233 ms / sample
