In [1]:
import os
import time
import glob
import copy
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, f1_score

# ------------------------------------------------------------------------------
# 0. Utils & Setup
# ------------------------------------------------------------------------------
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

# ------------------------------------------------------------------------------
# 1. MEHATLH
# ------------------------------------------------------------------------------
def _load_single_mhealth_log(path: str, feature_cols: list[str]):
    """
    하나의 mHealth_subjectXX.log 파일을 로드해서 DataFrame으로 반환.
    """
    df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=feature_cols + ["label"],
    )
    return df

def load_mhealth_dataframe(data_dir: str):
    """
    data_dir 안의 mHealth_subject*.log 전부 읽어서 하나의 DataFrame으로 concat.
    """
    feature_cols = [
        "acc_chest_x", "acc_chest_y", "acc_chest_z",      # 0, 1, 2
        "ecg_1", "ecg_2",                                 # 3, 4
        "acc_ankle_x", "acc_ankle_y", "acc_ankle_z",      # 5, 6, 7  <- Physics
        "gyro_ankle_x", "gyro_ankle_y", "gyro_ankle_z",   # 8, 9, 10 <- Physics
        "mag_ankle_x", "mag_ankle_y", "mag_ankle_z",      # 11, 12, 13
        "acc_arm_x", "acc_arm_y", "acc_arm_z",          # 14, 15, 16
        "gyro_arm_x", "gyro_arm_y", "gyro_arm_z",       # 17, 18, 19
        "mag_arm_x", "mag_arm_y", "mag_arm_z",          # 20, 21, 22
    ]  # 총 23 channels

    log_files = glob.glob(os.path.join(data_dir, "mHealth_subject*.log"))
    if not log_files:
         raise FileNotFoundError(f"No mHealth_subject*.log files found in {data_dir}")
    print(f"Found {len(log_files)} log files in {data_dir}")

    dfs = []
    for fp in log_files:
        df_i = _load_single_mhealth_log(fp, feature_cols)
        dfs.append(df_i)

    full_df = pd.concat(dfs, ignore_index=True)

    # Null 클래스(label==0)는 제외
    full_df = full_df[full_df["label"] != 0].copy()

    # 원래 라벨 1~12 → 0~11 로 shift
    full_df.loc[:, "label"] = full_df["label"] - 1

    return full_df, feature_cols

def create_mhealth_windows(
    df: pd.DataFrame,
    feature_cols: list[str],
    window_size: int,
    step_size: int,
):
    """
    전체 시계열을 (window_size, step_size)로 슬라이딩하면서 윈도우 생성.
    반환:
        X_np : (N, C, T) float32  <- (채널, 시간)
        y_np : (N,) int64         <- 0~11 로 이미 shift된 라벨
    """
    data_arr = df[feature_cols].to_numpy(dtype=np.float32)  # (L, 23)
    labels_arr = df["label"].to_numpy(dtype=np.int64)       # (L,)
    L = data_arr.shape[0]

    X_list = []
    y_list = []

    start = 0
    while start + window_size <= L:
        end = start + window_size
        window_x = data_arr[start:end]        # (T, C)
        window_label = labels_arr[end - 1]    # 마지막 타임스텝 라벨
        window_x_ct = np.transpose(window_x, (1, 0))  # (C, T)

        X_list.append(window_x_ct)
        y_list.append(int(window_label))
        start += step_size

    if not X_list:
        raise RuntimeError("No windows created. Check window_size / step_size / dataset length.")

    X_np = np.stack(X_list, axis=0).astype(np.float32)  # (N, C, T)
    y_np = np.array(y_list, dtype=np.int64)             # (N,)
    return X_np, y_np

class MHEALTHDataset(Dataset):
    """
    MHEALTH Dataset wrapper
    - X를 (N, T, C) 형태로 저장
    - __getitem__이 (X_tc, y, s) 튜플을 반환하도록 수정
    """
    def __init__(self, data_dir: str, window_size: int = 128, step_size: int = 64):
        super().__init__()

        # 1) 로그 로드 & 전처리
        full_df, feature_cols = load_mhealth_dataframe(data_dir)

        # 2) 슬라이딩 윈도우 생성
        X, y = create_mhealth_windows(
            df=full_df,
            feature_cols=feature_cols,
            window_size=window_size,
            step_size=step_size,
        ) # X: (N, C, T), y: (N,)

        # X를 (N, T, C) 형태로 저장 (UCI-HAR와 통일)
        self.X = np.transpose(X, (0, 2, 1)).astype(np.float32)
        self.y = y
        # 더미 subject ID 추가 (collate_fn 호환용)
        self.subjects = np.zeros(len(self.y), dtype=int) # (기존 코드 호환용)

        # 라벨 이름 (0~11 인덱스 기준)
        self.label_names = [
            "Standing still", "Sitting and relaxing", "Lying down",
            "Walking", "Climbing stairs", "Waist bends forward",
            "Frontal elevation of arms", "Knees bending", "Cycling",
            "Jogging", "Running", "Jump front & back",
        ]

        print("=" * 80)
        print("Loaded MHEALTH dataset")
        print(f"  X shape : {self.X.shape}  (N, T, C)")
        print(f"  y shape : {self.y.shape}  (N,)")
        print(f"  Classes : {len(self.label_names)}")
        print("=" * 80)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx: int):
        """
        (T, C) 텐서, 라벨, 더미 서브젝트 ID 반환
        """
        return (torch.FloatTensor(self.X[idx]),       # (T, C)
                torch.LongTensor([self.y[idx]])[0], # scalar
                self.subjects[idx])                 # scalar (dummy)

# ------------------------------------------------------------------------------
# 2. Baseline Model Components
# ------------------------------------------------------------------------------
# ASF-DCL과 공정한 비교를 위해 동일한 Encoder 구조 사용
class LatentEncoder(nn.Module):
    def __init__(self, input_channels=9, latent_dim=64):
        super().__init__()
        self.conv1 = nn.Conv1d(input_channels, 32, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(32)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(64)
        self.conv3 = nn.Conv1d(64, latent_dim, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(latent_dim)

    def forward(self, x):
        x = x.transpose(1, 2)
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        s = F.relu(self.bn3(self.conv3(h)))
        s = s.transpose(1, 2)
        return s

# Baseline Model: Encoder + Global Average Pooling + Classifier
class StandardCNN(nn.Module):
    def __init__(self, input_channels=9, latent_dim=64, num_classes=6, hidden_dim=64):
        super().__init__()
        self.latent_encoder = LatentEncoder(input_channels, latent_dim)

        # Flow 모듈 없이 바로 분류 (일반적인 CNN 구조)
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        # 1. Encode: [Batch, Time, Dim]
        s = self.latent_encoder(x)

        # 2. Global Average Pooling (Time 축 평균)
        s_pool = torch.mean(s, dim=1)

        # 3. Classify
        logits = self.classifier(s_pool)
        return logits

# ------------------------------------------------------------------------------
# 3. Train & Evaluate Functions (Baseline용)
# ------------------------------------------------------------------------------
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for batch in dataloader:
        x = batch[0].to(device)
        y = batch[1].to(device)

        optimizer.zero_grad()
        logits = model(x)
        loss = F.cross_entropy(logits, y, label_smoothing=0.05)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.detach().cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return avg_loss, f1

def evaluate_with_noise(model, dataloader, device, sigma):
    """
    AWGN 노이즈를 주입하여 모델의 견고성을 평가하는 함수
    sigma: 노이즈 강도 (Standard Deviation)
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            x = batch[0].to(device)
            y = batch[1].to(device)

            # --- Noise Injection ---
            if sigma > 0:
                noise = torch.randn_like(x) * sigma
                x = x + noise
            # -----------------------

            logits = model(x)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(y.detach().cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return acc, f1

# ------------------------------------------------------------------------------
# 4. Main Execution
# ------------------------------------------------------------------------------
def main():
    # 설정 (기존과 동일하게 맞춤)
    SEED = 42
    set_seed(SEED)
    DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/HAR_data/MHEALTHDATASET'
    BATCH_SIZE = 64
    NUM_EPOCHS = 50
    LEARNING_RATE = 0.001
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print(f"Running Standard 1D-CNN Baseline on {DEVICE}")

    # 데이터 로드
    full_dataset = MHEALTHDataset(DATA_PATH, window_size=128, step_size=64)

    total_size = len(full_dataset)
    train_size = int(total_size * 0.8)
    test_size = total_size - train_size

    train_dataset, test_dataset = random_split(
        full_dataset, [train_size, test_size],
        generator=torch.Generator().manual_seed(SEED)
    )

    g = torch.Generator()
    g.manual_seed(SEED)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              worker_init_fn=seed_worker, generator=g)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                             worker_init_fn=seed_worker, generator=g)

    # 모델 초기화
    model = StandardCNN(input_channels=23, latent_dim=64, num_classes=12).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=NUM_EPOCHS
    )

    best_f1 = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    # --- 학습 루프 ---
    print("\nStarting Training (Standard CNN)...")
    for epoch in range(NUM_EPOCHS):
        t_loss, t_f1 = train_epoch(model, train_loader, optimizer, DEVICE)

        # Validation (Noise=0.0)
        v_acc, v_f1 = evaluate_with_noise(model, test_loader, DEVICE, sigma=0.0)

        if v_f1 > best_f1:
            best_f1 = v_f1
            best_model_wts = copy.deepcopy(model.state_dict())

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{NUM_EPOCHS}] Train F1: {t_f1:.4f} | Test F1: {v_f1:.4f} (Best: {best_f1:.4f})")

    # --- 실험: AWGN Robustness ---
    print("\n" + "="*60)
    print(f" EXPERIMENT: Baseline Noise Robustness (Best F1: {best_f1:.4f})")
    print("="*60)

    # 최고 성능 모델 로드
    model.load_state_dict(best_model_wts)

    sigma_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]

    print(f"{'Sigma':<10} | {'Accuracy':<10} | {'F1-Score':<10}")
    print("-" * 36)

    for sigma in sigma_levels:
        acc, f1 = evaluate_with_noise(model, test_loader, DEVICE, sigma=sigma)
        print(f"{sigma:<10.1f} | {acc:<10.4f} | {f1:<10.4f}")

    print("-" * 36)
    print("Baseline Experiment Completed.")

if __name__ == "__main__":
    main()

Running Standard 1D-CNN Baseline on cuda
Found 10 log files in /content/drive/MyDrive/Colab Notebooks/HAR_data/MHEALTHDATASET
Loaded MHEALTH dataset
  X shape : (5361, 128, 23)  (N, T, C)
  y shape : (5361,)  (N,)
  Classes : 12

Starting Training (Standard CNN)...
Epoch [10/50] Train F1: 0.9785 | Test F1: 0.9739 (Best: 0.9766)
Epoch [20/50] Train F1: 0.9815 | Test F1: 0.9774 (Best: 0.9774)
Epoch [30/50] Train F1: 0.9859 | Test F1: 0.9756 (Best: 0.9774)
Epoch [40/50] Train F1: 0.9871 | Test F1: 0.9766 (Best: 0.9774)
Epoch [50/50] Train F1: 0.9885 | Test F1: 0.9736 (Best: 0.9774)

 EXPERIMENT: Baseline Noise Robustness (Best F1: 0.9774)
Sigma      | Accuracy   | F1-Score  
------------------------------------
0.0        | 0.9814     | 0.9774    
0.1        | 0.9814     | 0.9774    
0.2        | 0.9814     | 0.9774    
0.3        | 0.9814     | 0.9774    
0.4        | 0.9814     | 0.9774    
0.5        | 0.9814     | 0.9774    
------------------------------------
Baseline Experiment Com