In [1]:
import pandas as pd
import os
from dotenv import load_dotenv

# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Directory that is joined with each key of `categories` to locate the recordings.
data_path = os.getenv('RAW_DATA_PATH')
if data_path is None:
    # Fail here with a readable message instead of a TypeError from
    # os.path.join(None, ...) in a later cell.
    raise RuntimeError(
        "RAW_DATA_PATH is not set; define it in the environment or in a .env file"
    )

# Sub-directory name under data_path -> label stored in the 'action' column.
categories = {'idle': 'idle',
              'running': 'run',
              'stairs': 'stair',
              'walking': 'walk'}

In [2]:
import re

def natural_sort_key(s):
    """Build a sort key that orders runs of ASCII digits numerically.

    The string is split into alternating text / digit pieces; text pieces are
    lower-cased and digit pieces become ints, so 'run-2' sorts before 'run-10'.

    Args:
        s: The string (e.g. a file name) to build a key for.

    Returns:
        A list alternating str, int, str, ... that always starts and ends
        with a str (possibly empty).
    """
    # re.split with a capturing group always yields
    # [text, digits, text, digits, ..., text], so the odd positions are exactly
    # the [0-9]+ runs. Deciding by position rather than str.isdigit() avoids
    # treating Unicode digits such as '²' as numbers, which either makes int()
    # raise or puts an int where other keys hold a str.
    pieces = re.split('([0-9]+)', s)
    return [int(piece) if pos % 2 else piece.lower()
            for pos, piece in enumerate(pieces)]

def get_col_names(n_samples=30, axes=('x', 'y', 'z')):
    """Return column names for a flattened recording, sample by sample.

    Names are produced as '<axis>_<sample>' with the sample number starting
    at 1 and the axes cycling fastest, e.g. x_1, y_1, z_1, x_2, ...

    Args:
        n_samples: Number of samples per recording (defaults to 30).
        axes: Axis prefixes, in the order they appear within one sample.

    Returns:
        A list of ``n_samples * len(axes)`` column-name strings.
    """
    return [f'{axis}_{i}' for i in range(1, n_samples + 1) for axis in axes]

In [3]:
# The column names are identical for every recording, so build them once
# rather than on every loop iteration.
col_names = get_col_names()

dataframes = []

for category, label in categories.items():
    category_path = os.path.join(data_path, category)

    # Natural ordering keeps numbered files in numeric, not lexicographic, order.
    for file in sorted(os.listdir(category_path), key=natural_sort_key):
        file_path = os.path.join(category_path, file)
        raw_df = pd.read_csv(file_path)

        # Row-major flatten: every value of the first CSV row, then the second, ...
        # NOTE(review): this matches the x_1, y_1, z_1, x_2, ... naming only if the
        # CSV columns are ordered x, y, z - confirm against the raw files.
        flat_values = raw_df.values.flatten()
        if flat_values.size != len(col_names):
            # Same exception type the DataFrame constructor would raise, but
            # naming the offending file.
            raise ValueError(
                f'{file_path}: expected {len(col_names)} values, '
                f'found {flat_values.size}'
            )

        # One single-row frame per recording, tagged with its activity label.
        temp_df = pd.DataFrame([flat_values], columns=col_names)
        temp_df['action'] = label

        dataframes.append(temp_df)

# Stack all single-row frames into one table with a fresh 0..n-1 index.
df = pd.concat(dataframes, ignore_index=True)

In [4]:
from scipy.stats import entropy, skew, kurtosis
import numpy as np

def calculate_mean(df, sensor):
    """Return the per-row mean over columns ``{sensor}_1`` .. ``{sensor}_30``."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    window = df[window_cols]
    return window.mean(axis=1)

def calculate_variance(df, sensor):
    """Return the per-row variance (pandas default ``ddof``) of the sensor's 30 columns."""
    window_cols = []
    for i in range(1, 31):
        window_cols.append(f'{sensor}_{i}')
    window = df[window_cols]
    return window.var(axis=1)

def calculate_std_dev(df, sensor):
    """Return the per-row standard deviation (pandas default ``ddof``) of the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    per_row_std = readings.std(axis=1)
    return per_row_std

def calculate_median(df, sensor):
    """Return the per-row median of columns ``{sensor}_1`` .. ``{sensor}_30``."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    return readings.median(axis=1)

def calculate_range(df, sensor):
    """Return the per-row spread (max minus min) of the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    highest = readings.max(axis=1)
    lowest = readings.min(axis=1)
    return highest - lowest

def calculate_max_value(df, sensor):
    """Return the per-row maximum of columns ``{sensor}_1`` .. ``{sensor}_30``."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    return readings.max(axis=1)

def calculate_min_value(df, sensor):
    """Return the per-row minimum of columns ``{sensor}_1`` .. ``{sensor}_30``."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    return readings.min(axis=1)

def calculate_rms(df, sensor):
    """Return the per-row root mean square of the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    squared = df[window_cols] ** 2
    mean_square = squared.mean(axis=1)
    return np.sqrt(mean_square)

def calculate_signal_magnitude_area(df, sensor):
    """Return the per-row sum of absolute values over the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    magnitudes = np.abs(df[window_cols])
    return magnitudes.sum(axis=1)
    
def calculate_index_of_max_value(df, sensor):
    """Return, per row, the 1-based sample number at which the sensor peaks.

    Args:
        df: Frame containing columns ``{sensor}_1`` .. ``{sensor}_30``.
        sensor: Column-name prefix; may itself contain underscores.

    Returns:
        A Series of ints: the numeric suffix of the column holding each
        row's maximum.
    """
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    peak_columns = df[window_cols].idxmax(axis=1)
    # Take the text after the LAST underscore so a prefix such as 'acc_x'
    # (column 'acc_x_7') still yields 7 instead of failing on int('x').
    return peak_columns.apply(lambda name: int(name.rsplit('_', 1)[1]))

def calculate_index_of_min_value(df, sensor):
    """Return, per row, the 1-based sample number at which the sensor is lowest.

    Args:
        df: Frame containing columns ``{sensor}_1`` .. ``{sensor}_30``.
        sensor: Column-name prefix; may itself contain underscores.

    Returns:
        A Series of ints: the numeric suffix of the column holding each
        row's minimum.
    """
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    trough_columns = df[window_cols].idxmin(axis=1)
    # Take the text after the LAST underscore so a prefix such as 'acc_x'
    # (column 'acc_x_7') still yields 7 instead of failing on int('x').
    return trough_columns.apply(lambda name: int(name.rsplit('_', 1)[1]))

def calculate_power(df, sensor):
    """Return the per-row mean of squared values over the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    squared = df[window_cols] ** 2
    return squared.mean(axis=1)

def calculate_energy(df, sensor):
    """Return the per-row sum of squared values over the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    squared = df[window_cols] ** 2
    return squared.sum(axis=1)

def calculate_entropy(df, sensor):
    """Return the per-row entropy of the sensor's 30 values after a positive shift.

    Each row is shifted by ``abs(row minimum) + 1`` before being handed to
    ``scipy.stats.entropy``, so every value passed in is at least 1.
    """
    def shifted_entropy(values):
        # Keep the additions in this order so the floats match exactly.
        lifted = values + np.abs(values.min())
        return entropy(lifted + 1)

    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    return df[window_cols].apply(shifted_entropy, axis=1)

def calculate_skewness(df, sensor):
    """Return the per-row skewness (pandas ``DataFrame.skew``) of the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    return readings.skew(axis=1)

def calculate_kurtosis(df, sensor):
    """Return the per-row kurtosis of the sensor's 30 columns.

    Each row is passed to ``scipy.stats.kurtosis`` with its default arguments.
    """
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    return readings.apply(kurtosis, axis=1)

def calculate_interquartile_range(df, sensor):
    """Return the per-row difference between the 0.75 and 0.25 quantiles of the sensor's 30 columns."""
    window_cols = [f'{sensor}_{i}' for i in range(1, 31)]
    readings = df[window_cols]
    upper_quartile = readings.quantile(0.75, axis=1)
    lower_quartile = readings.quantile(0.25, axis=1)
    return upper_quartile - lower_quartile

def calculate_mean_absolute_deviation(df, sensor):
    """Return the per-row mean absolute deviation from the row mean over the sensor's 30 columns."""
    readings = df[[f'{sensor}_{i}' for i in range(1, 31)]]
    row_means = readings.mean(axis=1)
    # axis=0 aligns the row means with the frame's index, so each row is
    # centred on its own mean.
    centred = readings.subtract(row_means, axis=0)
    return centred.abs().mean(axis=1)

# Column-name suffix -> function computing that statistic for one axis.
# The insertion order here is the order in which columns are appended to df.
feature_functions = {
    'mean': calculate_mean,
    'variance': calculate_variance,
    'std': calculate_std_dev,
    'median': calculate_median,
    'range': calculate_range,
    'max_value': calculate_max_value,
    'min_value': calculate_min_value,
    'rms': calculate_rms,
    'signal_magnitude_area': calculate_signal_magnitude_area,
    'index_max_value': calculate_index_of_max_value,
    'index_min_value': calculate_index_of_min_value,
    'power': calculate_power,
    'energy': calculate_energy,
    'entropy': calculate_entropy,
    'skewness': calculate_skewness,
    'kurtosis': calculate_kurtosis,
    'interquartile_range': calculate_interquartile_range,
    'mean_absolute_deviation': calculate_mean_absolute_deviation,
}

# Add every statistic for x, then y, then z, as '<axis>_<suffix>' columns.
for axis_name in ['x', 'y', 'z']:
    for suffix, compute in feature_functions.items():
        df[f'{axis_name}_{suffix}'] = compute(df, axis_name)

In [16]:
def calculate_row_cross_correlation(row, sensor1, sensor2):
    """Return the correlation between two sensors' 30 samples within one row.

    The values are re-wrapped in fresh Series so they are paired by position
    (sample 1 with sample 1, ...) rather than by their differing column labels.
    """
    first_cols = [f'{sensor1}_{i}' for i in range(1, 31)]
    second_cols = [f'{sensor2}_{i}' for i in range(1, 31)]
    first = pd.Series(row[first_cols].values)
    second = pd.Series(row[second_cols].values)
    return first.corr(second)

# One correlation column per axis pair; the two prefixes are forwarded to the
# row function through DataFrame.apply's positional `args`.
for sensor1, sensor2 in [('x', 'y'), ('y', 'z'), ('z', 'x')]:
    df[f'cross_corr_{sensor1}_{sensor2}'] = df.apply(
        calculate_row_cross_correlation,
        axis=1,
        args=(sensor1, sensor2),
    )

In [6]:
# Integer code per distinct 'action' value, numbered by order of first appearance.
label_codes = pd.factorize(df['action'])[0]
df['label'] = label_codes

# Write the full feature table to disk without the row index.
df.to_csv('complete_ds.csv', index=False)