In [1]:
import os
import re
import time
import glob
import copy
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, f1_score

# ------------------------------------------------------------------------------
# 0. Utils & Setup
# ------------------------------------------------------------------------------
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# ------------------------------------------------------------------------------
# 1. PAMAP2
# ------------------------------------------------------------------------------
# ========================
# PAMAP2 데이터 로드
# ========================
def create_pamap2_windows(df: pd.DataFrame, window_size: int, step_size: int):
    """
    subject별로 timestamp 순서대로 전체 시계열을 따라가며 슬라이딩 윈도우 생성.
    한 윈도우의 라벨은 마지막 프레임의 activityID.
    마지막 라벨이 0(Null/기타) 이면 그 윈도우는 버린다.

    Returns:
        X:          (N, C, T) float32
        y:          (N,) int64  (0..11로 리맵된 레이블)
        subj_ids:   (N,) int64
        label_names:list[str] 길이 12, new_index -> human-readable
    """

    # 사용할 피처들 (orientation*, heartrate, *_Temperature 등은 제외)
    feature_cols = [
        # hand
        "handAcc16_1","handAcc16_2","handAcc16_3",
        "handAcc6_1","handAcc6_2","handAcc6_3",
        "handGyro1","handGyro2","handGyro3",
        "handMagne1","handMagne2","handMagne3",
        # chest
        "chestAcc16_1","chestAcc16_2","chestAcc16_3",
        "chestAcc6_1","chestAcc6_2","chestAcc6_3",
        "chestGyro1","chestGyro2","chestGyro3",
        "chestMagne1","chestMagne2","chestMagne3",
        # ankle
        "ankleAcc16_1","ankleAcc16_2","ankleAcc16_3",
        "ankleAcc6_1","ankleAcc6_2","ankleAcc6_3",
        "ankleGyro1","ankleGyro2","ankleGyro3",
        "ankleMagne1","ankleMagne2","ankleMagne3",
    ]

    # PAMAP2 실제 activityID들 중 우리가 쓰는 12개 클래스만 남김
    # 순서 고정: 이 순서가 new class index 0..11이 된다.
    ORDERED_IDS = [1, 2, 3, 4, 5, 6, 7, 12, 13, 16, 17, 24]

    # 원본 activityID -> new index(0..11)
    old2new = {
        1: 0,   # Lying
        2: 1,   # Sitting
        3: 2,   # Standing
        4: 3,   # Walking
        5: 4,   # Running
        6: 5,   # Cycling
        7: 6,   # Nordic walking
        12: 7,  # Ascending stairs
        13: 8,  # Descending stairs
        16: 9,  # Vacuum cleaning
        17: 10, # Ironing
        24: 11, # Rope jumping
    }

    # new index -> 사람이 읽는 이름
    label_names = [
        "Lying",              # 0 -> orig 1
        "Sitting",            # 1 -> orig 2
        "Standing",           # 2 -> orig 3
        "Walking",            # 3 -> orig 4
        "Running",            # 4 -> orig 5
        "Cycling",            # 5 -> orig 6
        "Nordic walking",     # 6 -> orig 7
        "Ascending stairs",   # 7 -> orig 12
        "Descending stairs",  # 8 -> orig 13
        "Vacuum cleaning",    # 9 -> orig 16
        "Ironing",            # 10 -> orig 17
        "Rope jumping",       # 11 -> orig 24
    ]

    X_list = []
    y_list = []
    subj_list = []

    # subject별로 끊어서 시간 순 정렬 후 슬라이딩 윈도우
    for subj_id, g in df.groupby("subject_id"):
        # 시간순 정렬
        if "timestamp" in g.columns:
            g = g.sort_values("timestamp")
        else:
            g = g.sort_index()

        data_arr  = g[feature_cols].to_numpy(dtype=np.float32)   # (L, C)
        label_arr = g["activityID"].to_numpy(dtype=np.int64)     # (L,)
        L = data_arr.shape[0]

        start = 0
        while start + window_size <= L:
            end = start + window_size

            last_label_orig = int(label_arr[end - 1])

            # 0 = "other / null" → 버림
            if last_label_orig == 0:
                start += step_size
                continue

            # 우리가 쓰는 12개 클래스에 없는 라벨이면 버림
            if last_label_orig not in old2new:
                start += step_size
                continue

            # 윈도우 추출
            window_ct = data_arr[start:end].T  # (T, C) -> (C, T)

            X_list.append(window_ct)
            y_list.append(old2new[last_label_orig])
            subj_list.append(int(subj_id))

            start += step_size

    # numpy 변환
    X = np.stack(X_list, axis=0).astype(np.float32)      # (N, C, T)
    y = np.asarray(y_list, dtype=np.int64)               # (N,)
    subj_ids = np.asarray(subj_list, dtype=np.int64)     # (N,)

    return X, y, subj_ids, label_names

class PAMAP2Dataset(Dataset):
    def __init__(self, data_dir, window_size, step_size):
        super().__init__()

        # 1) CSV 전부 읽어서 하나의 df로 합치기
        csv_files = glob.glob(os.path.join(data_dir, "*.csv"))
        if len(csv_files) == 0:
            raise RuntimeError(f"No CSV files found under {data_dir}")

        dfs = []
        for fpath in sorted(csv_files):
            df_i = pd.read_csv(fpath)

            if "subject_id" not in df_i.columns:
                m = re.findall(r"\d+", os.path.basename(fpath))
                subj_guess = int(m[0]) if len(m) > 0 else 0
                df_i["subject_id"] = subj_guess

            dfs.append(df_i)

        df = pd.concat(dfs, ignore_index=True)

        df = df.dropna(subset=['activityID'])

        # 기본 타입 정리
        df["activityID"] = df["activityID"].astype(np.int64)
        df["subject_id"] = df["subject_id"].astype(np.int64)
        if "timestamp" in df.columns:
            df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce")

        # ===========================
        # (1) NaN 처리
        # ===========================
        feature_cols = [
            # hand
            "handAcc16_1","handAcc16_2","handAcc16_3",
            "handAcc6_1","handAcc6_2","handAcc6_3",
            "handGyro1","handGyro2","handGyro3",
            "handMagne1","handMagne2","handMagne3",
            # chest
            "chestAcc16_1","chestAcc16_2","chestAcc16_3",
            "chestAcc6_1","chestAcc6_2","chestAcc6_3",
            "chestGyro1","chestGyro2","chestGyro3",
            "chestMagne1","chestMagne2","chestMagne3",
            # ankle
            "ankleAcc16_1","ankleAcc16_2","ankleAcc16_3",
            "ankleAcc6_1","ankleAcc6_2","ankleAcc6_3",
            "ankleGyro1","ankleGyro2","ankleGyro3",
            "ankleMagne1","ankleMagne2","ankleMagne3",
        ]

        # subject별로 결측치 보간 -> ffill/bfill로 마저 메우기
        def _fill_subject_group(g):
            # 시간 순으로 정렬 (timestamp 있으면 timestamp 기준)
            if "timestamp" in g.columns:
                g = g.sort_values("timestamp")
            else:
                g = g.sort_index()

            # 각 컬럼별로 interpolate + ffill/bfill
            g[feature_cols] = (
                g[feature_cols]
                .interpolate(method="linear", limit_direction="both", axis=0)
                .ffill()
                .bfill()
            )
            return g

        df = df.groupby("subject_id", group_keys=False).apply(_fill_subject_group)

        # 이 시점에서 feature_cols 안에 NaN이 남아있으면 안 됨
        # 혹시라도 남았으면 0으로 막아버리기 (safety net)
        df[feature_cols] = df[feature_cols].fillna(0.0)

        # ===========================
        # (2) 스케일 표준화
        # ===========================
        scaler = StandardScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols])

        # ===========================
        # (3) 윈도우 생성
        # ===========================
        X, y, subj_ids, label_names = create_pamap2_windows(
            df,
            window_size=window_size,
            step_size=step_size,
        )

        self.X = np.transpose(X, (0, 2, 1))
        self.y = y          # (N,)
        self.subject_ids = subj_ids
        self.label_names = label_names

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (
            torch.from_numpy(self.X[idx]).float(),
            torch.tensor(self.y[idx], dtype=torch.long),
            self.subject_ids[idx]
        )

# ------------------------------------------------------------------------------
# 2. Baseline Model Components
# ------------------------------------------------------------------------------
# ASF-DCL과 공정한 비교를 위해 동일한 Encoder 구조 사용
class LatentEncoder(nn.Module):
    def __init__(self, input_channels=9, latent_dim=64):
        super().__init__()
        self.conv1 = nn.Conv1d(input_channels, 32, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(32)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(64)
        self.conv3 = nn.Conv1d(64, latent_dim, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(latent_dim)

    def forward(self, x):
        x = x.transpose(1, 2)
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        s = F.relu(self.bn3(self.conv3(h)))
        s = s.transpose(1, 2)
        return s

# Baseline Model: Encoder + Global Average Pooling + Classifier
class StandardCNN(nn.Module):
    def __init__(self, input_channels=9, latent_dim=64, num_classes=6, hidden_dim=64):
        super().__init__()
        self.latent_encoder = LatentEncoder(input_channels, latent_dim)

        # Flow 모듈 없이 바로 분류 (일반적인 CNN 구조)
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        # 1. Encode: [Batch, Time, Dim]
        s = self.latent_encoder(x)

        # 2. Global Average Pooling (Time 축 평균)
        s_pool = torch.mean(s, dim=1)

        # 3. Classify
        logits = self.classifier(s_pool)
        return logits

# ------------------------------------------------------------------------------
# 3. Train & Evaluate Functions (Baseline용)
# ------------------------------------------------------------------------------
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in dataloader:
        x = batch[0].to(device)
        y = batch[1].to(device)

        optimizer.zero_grad()
        logits = model(x)
        loss = F.cross_entropy(logits, y, label_smoothing=0.05)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.detach().cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return avg_loss, f1

def evaluate_with_noise(model, dataloader, device, sigma):
    """
    AWGN 노이즈를 주입하여 모델의 견고성을 평가하는 함수
    sigma: 노이즈 강도 (Standard Deviation)
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            x = batch[0].to(device)
            y = batch[1].to(device)

            # --- Noise Injection ---
            if sigma > 0:
                noise = torch.randn_like(x) * sigma
                x = x + noise
            # -----------------------

            logits = model(x)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(y.detach().cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return acc, f1

# ------------------------------------------------------------------------------
# 4. Main Execution
# ------------------------------------------------------------------------------
def main():
    # 설정 (기존과 동일하게 맞춤)
    SEED = 42
    set_seed(SEED)
    DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/HAR_data'
    BATCH_SIZE = 64
    NUM_EPOCHS = 50
    LEARNING_RATE = 0.001
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print(f"Running Standard 1D-CNN Baseline on {DEVICE}")

    # 데이터 로드
    full_dataset = PAMAP2Dataset(DATA_PATH, window_size=128, step_size=64)

    total_size = len(full_dataset)
    train_size = int(total_size * 0.8)
    test_size = total_size - train_size

    train_dataset, test_dataset = random_split(
        full_dataset, [train_size, test_size],
        generator=torch.Generator().manual_seed(SEED)
    )

    g = torch.Generator()
    g.manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              worker_init_fn=seed_worker, generator=g)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                             worker_init_fn=seed_worker, generator=g)

    # 모델 초기화
    model = StandardCNN(input_channels=36, latent_dim=64, num_classes=12).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=NUM_EPOCHS
    )

    best_f1 = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    # --- 학습 루프 ---
    print("\nStarting Training (Standard CNN)...")
    for epoch in range(NUM_EPOCHS):
        t_loss, t_f1 = train_epoch(model, train_loader, optimizer, DEVICE)

        # Validation (Noise=0.0)
        v_acc, v_f1 = evaluate_with_noise(model, test_loader, DEVICE, sigma=0.0)

        if v_f1 > best_f1:
            best_f1 = v_f1
            best_model_wts = copy.deepcopy(model.state_dict())

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] Train F1: {t_f1:.4f} | Test F1: {v_f1:.4f} (Best: {best_f1:.4f})")

    # --- 실험: AWGN Robustness ---
    print("\n" + "="*60)
    print(f" EXPERIMENT: Baseline Noise Robustness (Best F1: {best_f1:.4f})")
    print("="*60)

    # 최고 성능 모델 로드
    model.load_state_dict(best_model_wts)

    sigma_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]

    print(f"{'Sigma':<10} | {'Accuracy':<10} | {'F1-Score':<10}")
    print("-" * 36)

    for sigma in sigma_levels:
        acc, f1 = evaluate_with_noise(model, test_loader, DEVICE, sigma=sigma)
        print(f"{sigma:<10.1f} | {acc:<10.4f} | {f1:<10.4f}")

    print("-" * 36)
    print("Baseline Experiment Completed.")

if __name__ == "__main__":
    main()

Running Standard 1D-CNN Baseline on cuda


  df = df.groupby("subject_id", group_keys=False).apply(_fill_subject_group)



Starting Training (Standard CNN)...
Epoch [10/50] Train F1: 0.9700 | Test F1: 0.9741 (Best: 0.9741)
Epoch [20/50] Train F1: 0.9801 | Test F1: 0.9770 (Best: 0.9770)
Epoch [30/50] Train F1: 0.9812 | Test F1: 0.9806 (Best: 0.9810)
Epoch [40/50] Train F1: 0.9851 | Test F1: 0.9841 (Best: 0.9841)
Epoch [50/50] Train F1: 0.9874 | Test F1: 0.9822 (Best: 0.9841)

 EXPERIMENT: Baseline Noise Robustness (Best F1: 0.9841)
Sigma      | Accuracy   | F1-Score  
------------------------------------
0.0        | 0.9847     | 0.9841    
0.1        | 0.9845     | 0.9839    
0.2        | 0.9842     | 0.9836    
0.3        | 0.9842     | 0.9833    
0.4        | 0.9825     | 0.9815    
0.5        | 0.9816     | 0.9806    
------------------------------------
Baseline Experiment Completed.
