# Import

In [1]:
import numpy as np
import pandas as pd
import polars as pl
import pickle
import torch
import os
import gc
import glob
import scipy.signal
import math
import json
import matplotlib.colors as mcolors
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import wandb
from types import SimpleNamespace
import _MultiResUNet as MultiResUNet
import torch.nn as nn
import torch.optim as optim
import os
import random
import pickle
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns                           
import numpy as np


In [2]:
# Sampling rates, in Hz.
SAMPLE_RATE = 200
SAMPLE_RATE_TARGET = 50  # rate the feature-engineering windows below are sized for

# Longest recording handled, in seconds, and its length in samples at SAMPLE_RATE.
MAX_TIME = 50
MAX_LENGTH = MAX_TIME * SAMPLE_RATE

# Rolling-window sizes, in seconds.
WINDOW_SIZES = [0.3, 0.6, 1.2]

BATCH_SIZE = 1

In [3]:
def load_config(filename, config_dir='model_checkpoints'):
    """Load a JSON configuration file into an attribute-style namespace.

    Args:
        filename: Name of the JSON file, relative to ``config_dir``.
        config_dir: Directory that contains the file. Defaults to
            ``'model_checkpoints'``, the location that was previously hard-coded.

    Returns:
        types.SimpleNamespace: One attribute per top-level key of the JSON object.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        TypeError: If the top-level JSON value is not an object (mapping).
    """
    config_path = os.path.join(config_dir, filename)
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config_dict = json.load(f)
    return SimpleNamespace(**config_dict)

# Hyper-parameters saved as JSON under model_checkpoints/; later cells read
# config.model_depth, config.SAVE_DIR, etc. to rebuild the model.
config = load_config('2_config.json')
print(config)

namespace(SAVE_DIR='model_checkpoints', model_depth=8, model_width=32, kernel_size=5, problem_type='Classification', ds=True, ae=False, feature_number=512, is_transconv=True, learning_rate=1e-05)


In [4]:
# Round MAX_LENGTH_TARGET up to a multiple of 2 ** model_depth.
# NOTE(review): presumably required so the network's down-sampling stages divide
# the sequence length evenly — confirm against _MultiResUNet.
factor = 2 ** config.model_depth
MAX_LENGTH_TARGET = math.ceil((SAMPLE_RATE_TARGET * MAX_TIME) / factor) * factor
print(f'Max recording time: {MAX_LENGTH_TARGET/SAMPLE_RATE_TARGET} sec')
# MAX_LENGTH_TARGET = SAMPLE_RATE_TARGET * MAX_TIME  # length of the sequence

## Alternative: round up to the next power of two (2 ** n)
# raw_max_length_target = SAMPLE_RATE_TARGET * MAX_TIME
# MAX_LENGTH_TARGET = 2 ** math.ceil(math.log2(raw_max_length_target))

Max recording time: 51.2 sec


# Make Examples

# Feature Engineering

In [5]:
class TimeSeriesFeatureEngineer:
    """Label encoding, resampling and rolling-window features for time series.

    Raw activity labels are collapsed to five classes (``walk``, ``rampascent``,
    ``rampdescent``, ``stairascent``, ``stairdescent``) through
    ``label_mapping`` and then one-hot encoded. Input sequences get rolling
    statistics, differences and lagged copies for every configured window.

    Attributes:
        window_sizes: Window lengths in samples (integer numpy array).
        encoder: Fitted ``OneHotEncoder``, or ``None`` until ``create_encoder``
            has been called.
        label_mapping: Dict from raw label to collapsed class label.
    """

    def __init__(self, window_sizes, sampling_rate):
        """Convert window sizes from seconds to samples.

        Args:
            window_sizes: Sequence of window lengths in seconds.
            sampling_rate: Sampling rate (Hz) of the data the windows are applied to.
        """
        # Round to the nearest sample before casting. Plain truncation turns
        # e.g. 0.58 s * 50 Hz = 28.999999999999996 into 28 instead of 29.
        self.window_sizes = np.rint(np.dot(window_sizes, sampling_rate)).astype(int)
        self.encoder = None
        self.label_mapping = {
            'idle': 'walk',
            'rampascent': 'rampascent',
            'rampascent-walk': 'rampascent',
            'rampdescent': 'rampdescent',
            'rampdescent-walk': 'rampdescent',
            'stairascent': 'stairascent',
            'stairascent-walk': 'stairascent',
            'stairdescent': 'stairdescent',
            'stairdescent-walk': 'stairdescent',
            'stand': 'walk',
            'stand-walk': 'walk',
            'turn1': 'walk',
            'turn2': 'walk',
            'walk': 'walk',
            'walk-rampascent': 'rampascent',
            'walk-rampdescent': 'rampdescent',
            'walk-stairascent': 'stairascent',
            'walk-stairdescent': 'stairdescent',
            'walk-stand': 'walk'
        }

    def map_labels(self, Y_data):
        """Collapse raw labels to their class label.

        Args:
            Y_data: Iterable of label sequences (one sequence per recording).

        Returns:
            list[np.ndarray]: One array of mapped labels per input sequence.

        Raises:
            KeyError: If a label is not a key of ``label_mapping``.
        """
        Y_data_mapped = []
        for y_seq in Y_data:
            Y_data_mapped.append(np.array([self.label_mapping[label] for label in y_seq]))
        return Y_data_mapped

    def create_encoder(self, Y_data):
        """Fit the one-hot encoder on the mapped labels found in ``Y_data``.

        Args:
            Y_data: Iterable of raw label sequences.

        Returns:
            The fitted ``OneHotEncoder`` (also stored on ``self.encoder``).
        """
        # Map raw labels to their collapsed class labels.
        Y_data_mapped = self.map_labels(Y_data)

        # Collect the distinct labels across all sequences.
        all_labels = np.concatenate(Y_data_mapped)
        all_labels_unique = np.unique(all_labels).reshape(-1, 1)

        # Fit the one-hot encoder on the distinct labels.
        self.encoder = OneHotEncoder(sparse_output=False)
        self.encoder.fit(all_labels_unique)

        # Show the categories the encoder learned.
        print("Encoder classes:", self.encoder.categories_)
        return self.encoder

    def fit_transform_labels(self, Y_data):
        """One-hot encode every label sequence with the already-fitted encoder.

        Despite the name this does not fit anything; ``create_encoder`` must
        have been called first.

        Args:
            Y_data: Iterable of raw label sequences.

        Returns:
            list: One encoded 2-D array per input sequence.

        Raises:
            ValueError: If the encoder has not been created yet.
        """
        if self.encoder is None:
            raise ValueError("Encoder has not been created. Call create_encoder first.")

        # Map raw labels to their collapsed class labels.
        Y_data_mapped = self.map_labels(Y_data)

        # One-hot encode each sequence separately.
        Y_data_encoded_list = [self.encoder.transform(np.array(y).reshape(-1, 1)) for y in Y_data_mapped]
        return Y_data_encoded_list

    def feature_engineering(self, df: "pl.DataFrame"):
        """Append rolling-window features for every column of ``df``.

        For each column and each window size the following columns are added:
        rolling mean, std, min and max, the difference over ``window`` rows, and
        copies shifted by 1, 2 and 3 windows. NaN and null values in the result
        are replaced with 0.

        Args:
            df: Polars frame holding one recording (rows are time steps).

        Returns:
            pl.DataFrame: The original columns followed by the new features.
        """
        lf = df.lazy()

        for col in df.columns:
            source = pl.col(col)
            for window in self.window_sizes:
                window = int(window)  # hand polars a plain Python int, not np.int64
                window_str = str(window)
                # Expressions (not eager Series) keep the plan lazy until collect().
                feature_exprs = [
                    source.rolling_mean(window).alias(col + '_mean_' + window_str),
                    source.rolling_std(window).alias(col + '_std_' + window_str),
                    source.rolling_min(window).alias(col + '_min_' + window_str),
                    source.rolling_max(window).alias(col + '_max_' + window_str),
                    source.diff(window).alias(col + '_diff_' + window_str),
                ]
                feature_exprs.extend(
                    source.shift(lag * window).alias(col + f'_lag_{lag}_' + window_str)
                    for lag in range(1, 4)
                )
                # One call per (column, window): a repeated window size simply
                # overwrites identically named columns instead of raising.
                lf = lf.with_columns(feature_exprs)

        features_df = lf.collect().fill_nan(0).fill_null(0)
        return features_df

    def fit_transform_features(self, X_data):
        """Run ``feature_engineering`` on every sequence.

        Args:
            X_data: Iterable of array-likes accepted by ``pl.DataFrame``.

        Returns:
            list[np.ndarray]: One feature matrix per input sequence.
        """
        X_features = []
        for seq in X_data:
            seq_df = pl.DataFrame(seq)
            features_df = self.feature_engineering(seq_df)
            X_features.append(features_df.to_numpy())
        return X_features

    def resample_data(self, X_data, original_sampling_rate, target_sampling_rate):
        """Resample every sequence with ``scipy.signal.resample``.

        Args:
            X_data: Iterable of sequences sampled at ``original_sampling_rate``.
            original_sampling_rate: Current sampling rate.
            target_sampling_rate: Desired sampling rate.

        Returns:
            list: Resampled sequences; each has
            ``int(len(seq) * target_sampling_rate / original_sampling_rate)`` rows.
        """
        resampled_X_data = []
        for seq in X_data:
            resampled_seq = scipy.signal.resample(seq, int(len(seq) * target_sampling_rate / original_sampling_rate))
            resampled_X_data.append(resampled_seq)
        return resampled_X_data

    def fit(self, X_data, Y_data, original_sampling_rate, target_sampling_rate, train_dir="train_batches", val_dir="val_batches", test_size=0.2, max_workers=4):
        """Resample, split, encode and write train/validation samples to disk.

        Args:
            X_data: List of input sequences.
            Y_data: List of raw label sequences, aligned with ``X_data``.
            original_sampling_rate: Sampling rate of ``X_data``.
            target_sampling_rate: Rate ``X_data`` is resampled to.
            train_dir: Output directory for training samples.
            val_dir: Output directory for validation samples.
            test_size: Fraction of recordings held out for validation.
            max_workers: Number of worker threads used when writing samples.
        """
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)

        # Resample the data
        # NOTE(review): only X is resampled here; Y_data is passed on unchanged.
        # If labels are per-sample at the original rate, X and Y lengths will
        # differ after this step — confirm against the caller.
        X_data_resampled = self.resample_data(X_data, original_sampling_rate, target_sampling_rate)

        # Statistics
        sequence_length = [len(seq) for seq in X_data_resampled]
        print(f'Max sequence length: {max(sequence_length)}')
        print(f'Min sequence length: {min(sequence_length)}')
        print(f'Mean sequence length: {np.mean(sequence_length)}')

        # Train/Val split
        X_train, X_val, Y_train, Y_val = train_test_split(X_data_resampled, Y_data, test_size=test_size, random_state=42)

        # Fit the encoder on all labels so both splits share the same categories.
        self.create_encoder(Y_data)
        Y_train_encoded = self.fit_transform_labels(Y_train)
        Y_val_encoded = self.fit_transform_labels(Y_val)

        # Write the training samples.
        self._process_and_save_individual(X_train, Y_train_encoded, train_dir, max_workers)
        # Write the validation samples.
        self._process_and_save_individual(X_val, Y_val_encoded, val_dir, max_workers)


    def _process_and_save_individual(self, X_data, Y_data, save_dir, max_workers):
        """Compute features per sample and pickle X/Y pairs into ``save_dir``.

        Files are named ``X_data_{idx}.pkl`` and ``Y_data_{idx}.pkl`` where
        ``idx`` is the position of the sample in ``X_data``.

        Raises:
            Exception: Whatever a worker raised; the first failure seen is
                re-raised instead of being discarded.
        """
        def process_and_save(idx):
            X_features = self.fit_transform_features([X_data[idx]])[0]
            Y_encoded = Y_data[idx]

            with open(os.path.join(save_dir, f"X_data_{idx}.pkl"), 'wb') as f:
                pickle.dump(X_features, f)
            with open(os.path.join(save_dir, f"Y_data_{idx}.pkl"), 'wb') as f:
                pickle.dump(Y_encoded, f)

            # Release the feature matrix promptly; many samples are in flight.
            del X_features, Y_encoded
            gc.collect()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_and_save, idx) for idx in range(len(X_data))]
            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing data in {save_dir}", unit="sample"):
                # Surface worker exceptions. Without this a failed sample is
                # silently skipped and its files are never written.
                future.result()

# Data Loader

In [6]:
# Load CSV files and apply feature engineering
def load_and_process_csv_files(test_folder, feature_engineer):
    """Read every CSV in ``test_folder`` and compute its feature matrix.

    Files are processed in sorted path order so that the position of a
    recording in the returned list (and therefore the index used in plot file
    names downstream) is the same on every run and every platform.

    Args:
        test_folder: Directory searched (non-recursively) for ``*.csv`` files.
        feature_engineer: Object exposing ``fit_transform_features``.

    Returns:
        list: One feature matrix per CSV file, in sorted file order.
    """
    # glob.glob returns files in arbitrary, OS-dependent order; sort for determinism.
    csv_files = sorted(glob.glob(os.path.join(test_folder, '*.csv')))
    X_data = []

    for file in tqdm(csv_files, desc="Loading CSV files"):
        df = pd.read_csv(file)
        # Drop the bookkeeping columns 'Header', 'time' and 'Time' when present.
        df = df.drop(columns=['Header', 'time', 'Time'], errors='ignore')
        X_data.append(df.values)

    X_features = feature_engineer.fit_transform_features(X_data)
    return X_features

class TestTimeSeriesDataset(Dataset):
    """Dataset of unlabeled recordings, padded or trimmed to a fixed length.

    All CSV files in ``test_folder`` are loaded and feature-engineered once in
    the constructor; ``__getitem__`` only pads or trims.
    """

    def __init__(self, test_folder, feature_engineer, max_length):
        """
        Args:
            test_folder: Directory containing the ``*.csv`` recordings.
            feature_engineer: Object exposing ``fit_transform_features``.
            max_length: Number of time steps every returned sample will have.
        """
        self.X_data = load_and_process_csv_files(test_folder, feature_engineer)
        self.max_length = max_length

    def __len__(self):
        """Return the number of recordings."""
        return len(self.X_data)

    def __getitem__(self, idx):
        """Return recording ``idx`` as a float32 tensor of ``max_length`` steps."""
        X_data = self.X_data[idx]
        X_padded = self.pad_or_trim_sequence(X_data)
        return X_padded

    def pad_or_trim_sequence(self, sequence):
        """Force ``sequence`` to exactly ``max_length`` rows.

        Longer sequences are cut at the end; shorter ones are zero-padded at the
        end. Only the first (time) axis is changed.

        Args:
            sequence: numpy array whose first axis is time.

        Returns:
            torch.Tensor: float32 tensor with ``max_length`` rows.
        """
        seq_len = len(sequence)

        if seq_len > self.max_length:
            return torch.tensor(sequence[:self.max_length], dtype=torch.float32)

        padding_length = self.max_length - seq_len
        # Build the pad width per axis. A bare (0, padding_length) is broadcast by
        # np.pad to every axis, which would also widen a (n, 1) feature axis.
        pad_width = [(0, padding_length)] + [(0, 0)] * (sequence.ndim - 1)
        padded_seq = np.pad(sequence, pad_width, 'constant', constant_values=0)
        return torch.tensor(padded_seq, dtype=torch.float32)

In [7]:
# Define parameters
test_folder = "test"
# Window sizes are converted to samples using SAMPLE_RATE_TARGET.
# NOTE(review): the test CSVs are not resampled anywhere in this notebook, so this
# assumes they are already recorded/stored at SAMPLE_RATE_TARGET — confirm.
feature_engineer = TimeSeriesFeatureEngineer(WINDOW_SIZES, SAMPLE_RATE_TARGET)

# Create the test dataset and data loader
# shuffle=False keeps batch order equal to dataset order, which the plot indices rely on.
test_dataset = TestTimeSeriesDataset(test_folder=test_folder, feature_engineer=feature_engineer, max_length=MAX_LENGTH_TARGET)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

Loading CSV files: 100%|██████████| 10/10 [00:00<00:00, 114.03it/s]


In [8]:
# # Example usage
# for X_batch in test_loader:
#     print(X_batch.shape)
#     pass

# Eval

In [9]:
def predict(model, data_loader):
    """Run the model over ``data_loader`` and collect per-step class predictions.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    eval mode. Mixed precision is enabled only on CUDA, so CPU-only runs do not
    trigger autocast warnings.

    Args:
        model: ``torch.nn.Module``. If it returns a list (deep supervision),
            the last element is used.
        data_loader: Iterable yielding input tensors. The model output is
            indexed with the class dimension at ``dim=2``.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(predictions, probabilities)`` where
        ``predictions`` holds the arg-max class index along ``dim=2`` and
        ``probabilities`` the soft-max over ``dim=2``, both concatenated over
        batches along axis 0.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    all_preds = []
    all_probabilities = []

    # Autocast is only useful (and only warning-free) on CUDA.
    use_amp = device.type == "cuda"

    with torch.no_grad():
        for X_batch in data_loader:
            X_batch = X_batch.to(device)

            with torch.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(X_batch)
                if isinstance(outputs, list):  # Deep Supervision
                    outputs = outputs[-1]  # Use the last output

                probs = torch.softmax(outputs, dim=2)  # Calculate probabilities for each class
                preds = torch.argmax(probs, dim=2)  # Get predicted class indices

            all_preds.append(preds.cpu().numpy())
            all_probabilities.append(probs.cpu().numpy())

    if not all_preds:
        # np.concatenate would fail with an unrelated-looking message.
        raise ValueError("data_loader yielded no batches; nothing to predict")

    all_preds = np.concatenate(all_preds, axis=0)
    all_probabilities = np.concatenate(all_probabilities, axis=0)

    return all_preds, all_probabilities


In [10]:
def plot_probabilities(predictions, probabilities, class_names, save_dir, idx):
    """Draw one panel per class with predicted probability and predicted label.

    Args:
        predictions: Integer array indexed as ``[trial, time]`` with class indices.
        probabilities: Array indexed as ``[trial, time, class]``.
        class_names: Panel titles, one per class.
        save_dir: Directory the PNG is written to (created if missing).
        idx: Index used in the figure title and in the output file name
            ``test_{idx}_probabilities.png``.
    """
    num_classes = len(class_names)
    num_trials = predictions.shape[0]
    steps = range(probabilities.shape[1])

    fig, axes = plt.subplots(num_classes, 1, figsize=(10, num_classes * 2), sharex=True)

    # plt.subplots returns a bare Axes (not an array) for a single row.
    if num_classes == 1:
        axes = [axes]

    # One-hot view of the predicted labels: row k of the identity is the
    # indicator vector for class k.
    predictions_one_hot = np.eye(num_classes)[predictions]

    palette = list(mcolors.TABLEAU_COLORS.values())
    palette_size = len(palette)

    for class_idx, (ax, class_name) in enumerate(zip(axes, class_names)):
        prob_color = palette[class_idx % palette_size]
        # Offset by half the palette so the label trace contrasts with the probability trace.
        label_color = palette[(class_idx + palette_size // 2) % palette_size]
        for trial in range(num_trials):
            class_probs = probabilities[trial, :, class_idx]
            class_hits = predictions_one_hot[trial, :, class_idx]
            ax.plot(steps, class_probs, label='Predicted', alpha=0.6, color=prob_color)
            ax.fill_between(steps, 0, class_probs, alpha=0.2, color=prob_color)
            ax.plot(steps, class_hits, linestyle='dashed', label='Predicted Label', alpha=0.6, color=label_color)
            ax.fill_between(steps, 0, class_hits, alpha=0.2, color=label_color)
        ax.set_ylabel('Probability', fontsize=14)
        ax.set_ylim(0, 1)
        ax.set_title(f'{class_name}', fontsize=18)
        ax.legend(fontsize=14)

    axes[-1].set_xlabel('Time Steps', fontsize=14)

    fig.suptitle(f'{idx}th Result', fontsize=24, y=0.99, x=0.85)
    fig.tight_layout(rect=[0, 0, 1, 1.02])

    # Ensure save directory exists
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    fig.savefig(os.path.join(save_dir, f'test_{idx}_probabilities.png'), dpi=300)
    # Close explicitly: this function is called in a loop and figures would pile up.
    plt.close(fig)


In [11]:
def plot_probabilities_for_all_trials(probabilities, predictions, class_names, save_dir):
    """Save one probability figure per entry along the first axis.

    Args:
        probabilities: Array whose first axis enumerates trials.
        predictions: Array of predicted class indices, same first axis.
        class_names: Class names forwarded to ``plot_probabilities``.
        save_dir: Output directory forwarded to ``plot_probabilities``.
    """
    trial_indices = range(probabilities.shape[0])
    for trial in tqdm(trial_indices, desc="Plotting probabilities", unit="plot"):
        # Length-1 slice rather than an integer index: plot_probabilities
        # expects the leading trial axis to be present.
        single = slice(trial, trial + 1)
        plot_probabilities(predictions[single], probabilities[single], class_names, save_dir, trial)


In [12]:
def load_model(model, path, map_location='cpu'):
    """Load a saved state dict from ``path`` into ``model``.

    Args:
        model: ``torch.nn.Module`` whose parameters will be overwritten.
        path: Path to a file written with ``torch.save(state_dict, path)``.
        map_location: Forwarded to ``torch.load``. Defaults to ``'cpu'`` so a
            checkpoint saved on a GPU machine also loads on a CPU-only one;
            ``load_state_dict`` then copies the values onto whatever device
            the model's parameters live on.

    Returns:
        The same ``model`` instance, with the loaded weights.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source (or pass weights_only=True where the torch version supports it).
    state_dict = torch.load(path, map_location=map_location)
    model.load_state_dict(state_dict)
    return model

In [13]:
# Display names for the model's output classes; index i labels output channel i.
# NOTE(review): presumably this must follow the category order of the encoder used
# at training time — verify against the training notebook.
class_names = ['ramp ascent', 'ramp descent', 'stair ascent', 'stair descent', 'walk']

In [14]:
# Use one batch from the data loader to derive the model's input length,
# input channel count and output channel count.
first_batch = next(iter(test_loader))
first_sample = first_batch[0]  # first element of the batch
length, num_channel = first_sample.shape[0], first_sample.shape[1]
output_channels = len(class_names)

In [15]:
# Rebuild the network from the saved hyper-parameters and the probed input shape.
model = MultiResUNet.UNet(
    length=length,
    model_depth=config.model_depth,
    num_channel=num_channel,
    model_width=config.model_width,
    kernel_size=config.kernel_size,
    problem_type=config.problem_type,
    output_channels=output_channels,
    ds=config.ds,
    ae=config.ae,
    feature_number=config.feature_number,
    is_transconv=config.is_transconv,
)

# Not referenced by any other cell visible in this notebook.
criterion = torch.nn.BCEWithLogitsLoss()

# load_model returns the instance it was given, so loaded_model and model are the same object.
checkpoint_path = os.path.join(config.SAVE_DIR, '2_best_model_checkpoint.pth')
loaded_model = load_model(model, checkpoint_path)

In [16]:
# Run inference over every test recording; `model` is the instance the checkpoint
# weights were loaded into in the previous cell.
all_preds, all_probabilities = predict(model, test_loader)

  return F.conv1d(input, weight, bias, self.stride,


In [17]:
# Write one PNG per test recording.
# NOTE(review): save_dir is the input folder (test_folder), so figures are written
# next to the source CSV files — confirm this is intended.
plot_probabilities_for_all_trials(all_probabilities, all_preds, class_names, save_dir=test_folder)

Plotting probabilities: 100%|██████████| 10/10 [00:05<00:00,  1.68plot/s]
