In [1]:
import numpy as np
import pandas as pd
import polars as pl
import torch
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

In [2]:
sampling_rate = 200  # Hz
max_time = 50        # sec
max_length = sampling_rate * max_time  # length of the sequence
window_sizes = [0.3, 0.6, 1.2]  # 초 단위 윈도우 크기

# Make data

In [7]:
class TimeSeriesFeatureEngineer:
    def __init__(self, window_sizes, sampling_rate, max_length):
        self.window_sizes = np.dot(window_sizes, sampling_rate).astype(int)
        self.encoder = None
        self.max_length = max_length
        self.label_mapping = {
            'idle': 'walk',
            'rampascent': 'rampascent',
            'rampascent-walk': 'rampascent',
            'rampdescent': 'rampdescent',
            'rampdescent-walk': 'rampdescent',
            'stairascent': 'stairascent',
            'stairascent-walk': 'stairascent',
            'stairdescent': 'stairdescent',
            'stairdescent-walk': 'stairdescent',
            'stand': 'walk',
            'stand-walk': 'walk',
            'turn1': 'walk',
            'turn2': 'walk',
            'walk': 'walk',
            'walk-rampascent': 'rampascent',
            'walk-rampdescent': 'rampdescent',
            'walk-stairascent': 'stairascent',
            'walk-stairdescent': 'stairdescent',
            'walk-stand': 'walk'
        }

    def map_labels(self, Y_data):
        Y_data_mapped = []
        for y_seq in Y_data:
            Y_data_mapped.append(np.array([self.label_mapping[label] for label in y_seq]))
        return Y_data_mapped

    def fit_transform_labels(self, Y_data):
        # 라벨 매핑
        Y_data_mapped = self.map_labels(Y_data)
        
        # 전체 라벨 수집
        all_labels = np.concatenate(Y_data_mapped)
        all_labels_unique = np.unique(all_labels).reshape(-1, 1)
        
        # OneHotEncoder를 사용하여 라벨 인코딩
        self.encoder = OneHotEncoder(sparse_output=False)
        self.encoder.fit(all_labels_unique)
        
        # 각 Y_data를 원핫 인코딩
        Y_data_encoded_list = [self.encoder.transform(np.array(y).reshape(-1, 1)) for y in Y_data_mapped]
        return Y_data_encoded_list

    def transform_labels(self, Y_data):
        # 라벨 매핑
        Y_data_mapped = self.map_labels(Y_data)
        
        # 각 Y_data를 원핫 인코딩
        Y_data_encoded_list = [self.encoder.transform(np.array(y).reshape(-1, 1)) for y in Y_data_mapped]
        return Y_data_encoded_list

    def feature_engineering(self, df: pl.DataFrame):
        # LazyFrame으로 변환하여 작업
        lf = df.lazy()
        
        for col in df.columns:
            for window in self.window_sizes:
                window_str = str(window)
                # 통계 값
                lf = lf.with_columns([
                    df[col].rolling_mean(window).alias(col + '_mean_' + window_str),
                    df[col].rolling_std(window).alias(col + '_std_' + window_str),
                    df[col].rolling_min(window).alias(col + '_min_' + window_str),
                    df[col].rolling_max(window).alias(col + '_max_' + window_str),
                    df[col].diff(window).alias(col + '_diff_' + window_str),
                    df[col].rolling_mean(window).alias(col + '_ma_' + window_str),
                    df[col].rolling_std(window).alias(col + '_stddev_' + window_str)
                ])
                for lag in range(1, 4):
                    lf = lf.with_columns([
                        df[col].shift(lag * window).alias(col + f'_lag_{lag}_' + window_str)
                    ])
        
        features_df = lf.collect().fill_nan(0).fill_null(0)
        return features_df

    def fit_transform_features(self, X_data):
        X_features = []
        for seq in X_data:
            seq_df = pl.DataFrame(seq)
            features_df = self.feature_engineering(seq_df)
            X_features.append(features_df.to_numpy())
        return X_features

    def pad_or_trim_sequences(self, sequences):
        padded_sequences = []
        for seq in sequences:
            if len(seq) > self.max_length:
                padded_sequences.append(torch.tensor(seq[:self.max_length], dtype=torch.float32))
            else:
                padding_length = self.max_length - len(seq)
                padded_seq = np.pad(seq, ((0, padding_length), (0, 0)), 'constant', constant_values=0)
                padded_sequences.append(torch.tensor(padded_seq, dtype=torch.float32))
        return torch.stack(padded_sequences)

    def pad_or_trim_labels(self, sequences):
        padded_labels = []
        for seq in sequences:
            if len(seq) > self.max_length:
                padded_labels.append(torch.tensor(seq[:self.max_length], dtype=torch.float32))
            else:
                padding_length = self.max_length - len(seq)
                padded_seq = np.pad(seq, ((0, padding_length), (0, seq.shape[1])), 'constant', constant_values=0)
                padded_labels.append(torch.tensor(padded_seq, dtype=torch.float32))
        return torch.stack(padded_labels)

    def fit(self, X_data, Y_data, batch_size=100, temp_dir="temp_batches"):
        os.makedirs(temp_dir, exist_ok=True)

        num_batches = len(X_data) // batch_size + (1 if len(X_data) % batch_size != 0 else 0)

        for batch_idx in tqdm(range(num_batches), desc="Processing Batches", unit="batch"):
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(X_data))
            X_batch = X_data[start_idx:end_idx]
            Y_batch = Y_data[start_idx:end_idx]

            # 라벨 인코딩
            Y_data_encoded_list = self.fit_transform_labels(Y_batch)
            
            # 특징 공학
            X_features = self.fit_transform_features(X_batch)
            
            # 배치 크기에 맞춘 패딩 적용
            X_data_padded = self.pad_or_trim_sequences(X_features)
            Y_data_padded = self.pad_or_trim_labels(Y_data_encoded_list)

            # 배치 단위로 저장
            torch.save(X_data_padded, os.path.join(temp_dir, f"X_data_padded_batch_{batch_idx}.pth"))
            torch.save(Y_data_padded, os.path.join(temp_dir, f"Y_data_padded_batch_{batch_idx}.pth"))

        # 저장된 배치를 불러와서 합치기
        X_data_padded_list = []
        Y_data_padded_list = []
        for batch_idx in tqdm(range(num_batches), desc="Loading Batches", unit="batch"):
            X_data_padded = torch.load(os.path.join(temp_dir, f"X_data_padded_batch_{batch_idx}.pth"))
            Y_data_padded = torch.load(os.path.join(temp_dir, f"Y_data_padded_batch_{batch_idx}.pth"))
            X_data_padded_list.append(X_data_padded)
            Y_data_padded_list.append(Y_data_padded)

        X_data_padded = torch.cat(X_data_padded_list, dim=0)
        Y_data_padded = torch.cat(Y_data_padded_list, dim=0)

        return X_data_padded, Y_data_padded

    def transform(self, X_data, batch_size=100, temp_dir="temp_batches"):
        os.makedirs(temp_dir, exist_ok=True)

        num_batches = len(X_data) // batch_size + (1 if len(X_data) % batch_size != 0 else 0)

        for batch_idx in tqdm(range(num_batches), desc="Processing Batches", unit="batch"):
            start_idx = batch_idx * batch_size
            end_idx = min((batch_idx + 1) * batch_size, len(X_data))
            X_batch = X_data[start_idx:end_idx]

            # 특징 공학
            X_features = self.fit_transform_features(X_batch)
            
            # 배치 크기에 맞춘 패딩 적용
            X_data_padded = self.pad_or_trim_sequences(X_features)

            # 배치 단위로 저장
            torch.save(X_data_padded, os.path.join(temp_dir, f"X_data_padded_batch_{batch_idx}.pth"))

        # 저장된 배치를 불러와서 합치기
        X_data_padded_list = []
        for batch_idx in tqdm(range(num_batches), desc="Loading Batches", unit="batch"):
            X_data_padded = torch.load(os.path.join(temp_dir, f"X_data_padded_batch_{batch_idx}.pth"))
            X_data_padded_list.append(X_data_padded)

        X_data_padded = torch.cat(X_data_padded_list, dim=0)

        return X_data_padded

# Load Data

In [8]:
# 데이터 불러오기
X_data = np.load('X_data.npy', allow_pickle=True)
Y_data = np.load('Y_data.npy', allow_pickle=True)

print('X_data shape:', X_data.shape)
print('Y_data shape:', Y_data.shape)

X_data shape: (2990,)
Y_data shape: (2990,)


# Feature Engineering

In [9]:
feature_engineer = TimeSeriesFeatureEngineer(window_sizes, sampling_rate, max_length)


In [10]:
# 학습 데이터에 대해 특징 공학 및 패딩 적용
X_data_padded, Y_data_padded = feature_engineer.fit(X_data, Y_data, batch_size=100)

Processing Batches: 100%|██████████| 30/30 [08:03<00:00, 16.13s/batch]
Loading Batches: 100%|██████████| 30/30 [01:56<00:00,  3.89s/batch]


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 88982400000 bytes.

# Split X & Y

In [None]:
# 학습 및 검증 데이터셋 나누기
X_train, X_val, Y_train, Y_val = train_test_split(X_data_padded, Y_data_padded, test_size=0.2, random_state=42)

print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)
print('X_val shape:', X_val.shape)
print('Y_val shape:', Y_val.shape)

In [None]:
# 텐서 데이터셋 생성
train_dataset = TensorDataset(X_train, Y_train)
val_dataset = TensorDataset(X_val, Y_val)

In [None]:
# 데이터셋 저장 경로 설정
train_dataset_path = 'train_dataset.pth'
val_dataset_path = 'val_dataset.pth'

# 데이터셋 저장
torch.save(train_dataset, train_dataset_path)
torch.save(val_dataset, val_dataset_path)

# 데이터셋 불러오기
loaded_train_dataset = torch.load(train_dataset_path)
loaded_val_dataset = torch.load(val_dataset_path)

# Data Loader

In [None]:
# 데이터로더 생성
batch_size = 64  # 배치 크기 설정
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print('Train loader length:', len(train_loader))
print('Validation loader length:', len(val_loader))

# Test Data

In [None]:
test_csv_file = 'test_data.csv'
test_data_df = pd.read_csv(test_csv_file)

# polars DataFrame으로 변환
X_test_data = pl.from_pandas(test_data_df)

# 특징 공학 및 패딩 적용
X_test_padded = feature_engineer.transform([X_test_data.to_numpy()])
print('X_test_padded shape:', X_test_padded.shape)

# 테스트 데이터셋 생성
test_dataset = TensorDataset(X_test_padded)

# 테스트 데이터로더 생성
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print('Test loader length:', len(test_loader))

# One-Hot Encoder

In [4]:
# 라벨 매핑 사전 정의
label_mapping = {
    'idle': 'walk',
    'rampascent': 'rampascent',
    'rampascent-walk': 'rampascent',
    'rampdescent': 'rampdescent',
    'rampdescent-walk': 'rampdescent',
    'stairascent': 'stairascent',
    'stairascent-walk': 'stairascent',
    'stairdescent': 'stairdescent',
    'stairdescent-walk': 'stairdescent',
    'stand': 'walk',
    'stand-walk': 'walk',
    'turn1': 'walk',
    'turn2': 'walk',
    'walk': 'walk',
    'walk-rampascent': 'rampascent',
    'walk-rampdescent': 'rampdescent',
    'walk-stairascent': 'stairascent',
    'walk-stairdescent': 'stairdescent',
    'walk-stand': 'walk'
}

# Label mapping:
# 0: idle
# 1: rampascent
# 2: rampascent-walk
# 3: rampdescent
# 4: rampdescent-walk
# 5: stairascent
# 6: stairascent-walk
# 7: stairdescent
# 8: stairdescent-walk
# 9: stand
# 10: stand-walk
# 11: turn1
# 12: turn2
# 13: walk
# 14: walk-rampascent
# 15: walk-rampdescent
# 16: walk-stairascent
# 17: walk-stairdescent
# 18: walk-stand

In [5]:
# Y_data 매핑 적용
Y_data_mapped = []
for y_seq in Y_data:
    Y_data_mapped.append(np.array([label_mapping[label] for label in y_seq]))


In [6]:
# 전체 라벨 수집
all_labels = np.concatenate(Y_data_mapped)
all_labels_unique = np.unique(all_labels).reshape(-1, 1)

In [7]:
# OneHotEncoder를 사용하여 라벨 인코딩
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(all_labels_unique)

# 원핫 인코딩된 클래스와 원래 라벨 간의 매칭 출력
print("Label mapping:")
for i, label in enumerate(encoder.categories_[0]):
    print(f"{i}: {label}")

Label mapping:
0: rampascent
1: rampdescent
2: stairascent
3: stairdescent
4: walk


In [8]:
# 각 Y_data를 원핫 인코딩
Y_data_encoded_list = [encoder.transform(np.array(y).reshape(-1, 1)) for y in Y_data_mapped]

# Feature Engineering

In [9]:
# 특징 공학 함수 정의
def feature_engineering(df: pl.DataFrame, window_sizes):
    # LazyFrame으로 변환하여 작업
    lf = df.lazy()
    
    for col in df.columns:
        for window in window_sizes:
            window_int = int(window)  # numpy.float64를 int로 변환
            window_str = str(window_int)
            # 통계 값
            lf = lf.with_columns([
                df[col].rolling_mean(window_int).alias(col + '_mean_' + window_str),
                df[col].rolling_std(window_int).alias(col + '_std_' + window_str),
                df[col].rolling_min(window_int).alias(col + '_min_' + window_str),
                df[col].rolling_max(window_int).alias(col + '_max_' + window_str),
                df[col].diff(window_int).alias(col + '_diff_' + window_str),
                df[col].rolling_mean(window_int).alias(col + '_ma_' + window_str),
                df[col].rolling_std(window_int).alias(col + '_stddev_' + window_str)
            ])
            for lag in range(1, 4):
                lf = lf.with_columns([
                    df[col].shift(lag * window_int).alias(col + f'_lag_{lag}_' + window_str)
                ])
    
    features_df = lf.collect().fill_nan(0).fill_null(0)
    return features_df

In [10]:
# 시퀀스별로 특징 공학 적용
window_sizes = np.dot([0.3, 0.6, 1.2], sampling_rate)    # 0.3 sec, 0.6 sec, 1.2 sec

In [11]:
X_features = []
for seq in X_data:
    seq_df = pl.DataFrame(seq)
    features_df = feature_engineering(seq_df, window_sizes)
    X_features.append(features_df.to_numpy())
    print(f'{len(X_features)}', end='\r')

Original X_data shape: (2990,)


AttributeError: 'list' object has no attribute 'shape'

# Set Padding

In [69]:
# 각 시퀀스의 길이 계산
sequence_lengths = [len(seq) for seq in X_data]

# 최대 길이 찾기
max_length = max(sequence_lengths)
min_length = min(sequence_lengths)
mean_length = np.mean(sequence_lengths)

print(f'Max sequence length: {max_length}')
print(f'Min sequence length: {min_length}')
print(f'Mean sequence length: {mean_length}')


Max sequence length: 9837
Min sequence length: 2001
Mean sequence length: 3316.9685618729095


In [71]:
# 패딩 적용 함수 정의
def pad_or_trim_sequences(sequences, max_length=5000):
    padded_sequences = []
    for seq in sequences:
        if len(seq) > max_length:
            padded_sequences.append(torch.tensor(seq[:max_length], dtype=torch.float32))
        else:
            padding_length = max_length - len(seq)
            padded_seq = np.pad(seq, ((0, padding_length), (0, 0)), 'constant', constant_values=0)
            padded_sequences.append(torch.tensor(padded_seq, dtype=torch.float32))
    return torch.stack(padded_sequences)

def pad_or_trim_labels(sequences, max_length=5000):
    padded_labels = []
    for seq in sequences:
        if len(seq) > max_length:
            padded_labels.append(torch.tensor(seq[:max_length], dtype=torch.float32))
        else:
            padding_length = max_length - len(seq)
            padded_seq = np.pad(seq, ((0, padding_length), (0, seq.shape[1])), 'constant', constant_values=0)
            padded_labels.append(torch.tensor(padded_seq, dtype=torch.float32))
    return torch.stack(padded_labels)

In [74]:
X_data_padded = pad_or_trim_sequences(X_features, max_length)
Y_data_padded = pad_or_trim_labels(Y_data_encoded_list, max_length)

print('X_data_padded shape:', X_data_padded.shape)
print('Y_data_padded shape:', Y_data_padded.shape)

X_data_padded shape: torch.Size([2990, 10000, 24])
Y_data_padded shape: torch.Size([2990, 10000, 10])


# Make Train & Val

In [None]:
# 학습 및 검증 데이터셋 나누기
X_train, X_val, Y_train, Y_val = train_test_split(X_data, Y_data, test_size=0.2, random_state=42)

print('X_train shape:', X_train.shape)
print('Y_train shape:', Y_train.shape)
print('X_val shape:', X_val.shape)
print('Y_val shape:', Y_val.shape)

In [33]:
# numpy 배열을 PyTorch 텐서로 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
Y_val_tensor = torch.tensor(Y_val, dtype=torch.long)

# 텐서 데이터셋 생성
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, Y_val_tensor)

# 데이터로더 생성
batch_size = 64  # 배치 크기 설정
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

print('Train loader length:', len(train_loader))
print('Validation loader length:', len(val_loader))