In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from math import ceil
from scipy import stats

from sklearn.preprocessing import StandardScaler

from keras.utils import to_categorical

import time

# Functions

## Frequency stability

In [2]:
def get_avg_period(df_arg, column_name_arg):
    """Compute the average measurement period of a dataframe
    :param df_arg: dataframe that holds the measurements
    :param column_name_arg: name of the column with the measurement times (e.g., the 'time' column)
    :return: mean of the differences between consecutive values of that column
    """
    measurement_times = df_arg[column_name_arg]
    # The first difference is NaN and is skipped by mean()
    consecutive_gaps = measurement_times.diff()
    return consecutive_gaps.mean()


def get_avg_frequency(df_arg, column_name_arg):
    """Compute the average measurement frequency of a dataframe
    :param df_arg: dataframe that holds the measurements
    :param column_name_arg: name of the column with the measurement times (e.g., the 'time' column)
    :return: reciprocal of the mean difference between consecutive values of that column
    """
    mean_period = df_arg[column_name_arg].diff().mean()
    return 1.0 / mean_period

## Data Filtering

In [3]:
def median_filter_data(df_arg, filter_columns_arg, window_size_arg):
    """Smooth the requested columns of a dataframe with a centered rolling median
    :param df_arg: dataframe that holds the columns to filter
    :param filter_columns_arg: list of column names to filter
    :param window_size_arg: window size of the median filter
    :return: a new dataframe with one '<column>_filtered' column per requested column, indexed like df_arg
    """

    def rolling_median(series):
        # min_periods=1 lets the shortened windows at both ends still produce a value
        return series.rolling(window=window_size_arg, center=True, min_periods=1).median()

    return pd.DataFrame({f'{name}_filtered': rolling_median(df_arg[name]) for name in filter_columns_arg})

## Exploratory data analysis

In [4]:
def get_undersampled_df(df_arg, column_name_arg):
    """Balance a dataframe by keeping the same number of rows for every class
    :param df_arg: dataframe to undersample
    :param column_name_arg: name of the column that holds the class label (e.g., the 'activity' column)
    :return: a dataframe made of the first N rows of every class, where N is the size of the smallest class;
    classes are laid out in order of first appearance and the original index labels are preserved
    """
    labels = df_arg[column_name_arg]
    class_counts = labels.value_counts()

    if class_counts.empty:
        # No labelled rows at all: return an empty frame that keeps the input schema
        return df_arg.iloc[0:0].copy()

    # Size of the smallest class; every class is truncated to this many rows
    min_samples = class_counts.min()

    # Collect the per-class slices and concatenate once instead of growing a frame inside the loop.
    # dropna() skips missing labels, which never match an equality test and so never contribute rows.
    class_subsets = [
        df_arg[labels == activity_class].iloc[:min_samples]
        for activity_class in labels.dropna().unique()
    ]
    return pd.concat(class_subsets)

In [5]:
def get_discard_columns(corr_matrix_arg, important_columns_arg, df_arg):
    """Select the columns to drop because they are strongly correlated with another column
    Two columns count as correlated when the absolute value of their correlation exceeds 0.5.
    For every correlated pair:
        - both columns important: both are kept;
        - exactly one column important: the other one is discarded;
        - neither column important: the one with the lower variance is discarded (on equal variance the
          column that comes later in the correlation matrix is discarded, so one of the two always survives).
    :param corr_matrix_arg: correlation matrix (a dataframe indexed and labelled by column names)
    :param important_columns_arg: collection of column names that must never be discarded
    :param df_arg: dataframe holding the data of the columns; used only to compare variances
    :return: a set with the names of the columns to discard
    """
    correlation_threshold = 0.5
    # Position of every column in the matrix, used only to break variance ties deterministically
    column_order = {name: position for position, name in enumerate(corr_matrix_arg.columns)}

    columns_to_discard = set()
    for column in corr_matrix_arg.columns:
        column_is_important = column in important_columns_arg
        correlated_columns = corr_matrix_arg.index[corr_matrix_arg[column].abs() > correlation_threshold]

        for correlated_column in correlated_columns:
            if column == correlated_column:
                continue
            other_is_important = correlated_column in important_columns_arg

            if column_is_important and other_is_important:
                # Important columns are kept even when they correlate with each other
                continue

            if column_is_important:
                columns_to_discard.add(correlated_column)
            elif other_is_important:
                columns_to_discard.add(column)
            else:
                # Neither column is important: keep the one that carries more variance.
                # The important-column checks above must come first, otherwise this branch is never reached.
                column_variance = df_arg[column].var()
                other_variance = df_arg[correlated_column].var()
                if other_variance < column_variance:
                    columns_to_discard.add(correlated_column)
                elif column_variance < other_variance:
                    columns_to_discard.add(column)
                else:
                    # Same (or incomparable) variance: the pair is visited twice, once from each side,
                    # so pick the same loser both times instead of discarding both columns
                    columns_to_discard.add(
                        max(column, correlated_column,
                            key=lambda name: column_order.get(name, len(column_order))))

    return columns_to_discard

## Windowing

In [6]:
def get_windowed_df(df_arg, window_duration_arg):
    """Cut a dataframe of filtered sensor readings into half-overlapping windows
    :param df_arg: dataframe with the columns 'time', 'accX_filtered', 'accY_filtered', 'accZ_filtered',
    'gyrZ_filtered' and 'activity'
    :param window_duration_arg: window duration, in the units of the 'time' column
    :return: a dataframe with one row per window; the columns 'accX', 'accY', 'accZ' and 'gyrZ' hold the array
    of filtered values of the window and 'activity' holds the most frequent activity of the window.
    Windows that run past the end of the data are kept and are shorter than the others.
    """
    # Number of data points inside one window, derived from the average sampling frequency
    sampling_frequency = 1.0 / df_arg['time'].diff().mean()  # Hz
    window_size = ceil(sampling_frequency * window_duration_arg)

    # Consecutive windows overlap by half of the window size
    step_size = window_size // 2
    if step_size == 0:
        # A window of a single sample has no half to overlap by, and range() rejects a zero step
        step_size = 1

    # Output column -> source column of df_arg
    signal_columns = {
        'accX': 'accX_filtered',
        'accY': 'accY_filtered',
        'accZ': 'accZ_filtered',
        'gyrZ': 'gyrZ_filtered',
    }
    windowed_dict = {output_column: [] for output_column in signal_columns}
    windowed_dict['activity'] = []

    for window_start in range(0, len(df_arg), step_size):
        window_df = df_arg.iloc[window_start:window_start + window_size]

        for output_column, source_column in signal_columns.items():
            windowed_dict[output_column].append(window_df[source_column].values)

        # Label the whole window with its most frequent activity
        windowed_dict['activity'].append(window_df['activity'].value_counts().idxmax())

    return pd.DataFrame.from_dict(windowed_dict)

## Feature engineering

In [7]:
def get_statistical_measures_df(windowed_data_df, functions, data_df_columns, result_df_columns):
    """Build a dataframe of statistical measures computed for every window of windowed_data_df
    :param windowed_data_df: a dataframe produced by windowing, i.e. every cell of the data columns holds the array
    of values of one window
    :param functions: a list of callables; each one receives the array of one window and returns one measure
    :param data_df_columns: a list of column names of windowed_data_df for which the measures are computed
    :param result_df_columns: a list of measure names, paired with functions position by position
    :return: a dataframe with one column per (measure, data column) pair, named '<data column>_<measure name>';
    columns are ordered measure by measure and, inside one measure, in the order of data_df_columns
    An example of using the function:
        Suppose windowed_data_df has the columns 'accX', 'accY', 'accZ' (accelerometer readings per axis, one array
        per window) and we need the mean and the average absolute difference of every axis.
        The call
        get_statistical_measures_df(windowed_data_df=windowed_data_df,
                                    functions=[
                                        lambda x: x.mean(),
                                        lambda x: np.mean(np.absolute(x - np.mean(x)))
                                    ],
                                    data_df_columns=['accX', 'accY', 'accZ'],
                                    result_df_columns=['mean', 'aad'])
        returns a dataframe with the columns 'accX_mean', 'accY_mean', 'accZ_mean', 'accX_aad', 'accY_aad',
        'accZ_aad'.
    """
    measures = {
        f'{data_column}_{measure_name}': windowed_data_df[data_column].apply(measure)
        for measure, measure_name in zip(functions, result_df_columns)
        for data_column in data_df_columns
    }
    return pd.DataFrame(measures)

In [8]:
def perform_feature_engineering(df_arg, functions_list):
    """Turn a windowed dataframe into a dataframe of statistical features plus the window label
    :param df_arg: windowed dataframe with the columns 'accX', 'accY', 'accZ', 'gyrZ' and 'activity'
    :param functions_list: list of callables computing the measures; paired position by position with the names
    'mean', 'std', 'aad', 'min', 'max', 'range', 'median', 'iqr', 'neg_count', 'pos_count', 'assymetry', 'kurtosis'
    :return: a new dataframe with the feature columns followed by an 'activity' column as the last column
    """
    measure_names = ['mean', 'std', 'aad', 'min', 'max', 'range', 'median', 'iqr', 'neg_count', 'pos_count',
                     'assymetry', 'kurtosis']
    sensor_axes = ['accX', 'accY', 'accZ', 'gyrZ']

    # Keep the labels aside; the feature dataframe is built from the sensor columns only
    window_labels = df_arg['activity'].values

    features_df = get_statistical_measures_df(windowed_data_df=df_arg,
                                              functions=functions_list,
                                              data_df_columns=sensor_axes,
                                              result_df_columns=measure_names)
    features_df['activity'] = window_labels
    return features_df

## Model Training Data Preparation

In [9]:
def split_train_data(train_df_arg, training_part=0.8):
    """Split a feature dataframe into training and validation parts, class by class
    For every activity the first training_part share of its rows (in their existing order, no shuffling) goes
    to the training part and the remaining rows go to the validation part.
    :param train_df_arg: dataframe with the columns 'activity' and 'activity_number'; the LAST TWO columns are
    treated as label columns and every other column as a feature, so the label columns must come last
    :param training_part: share of the rows of every activity that goes to the training part
    :return: the list [X_train, y_train, X_valid, y_valid], where X_* are dataframes of features (the row index
    restarts from 0 inside every activity) and y_* are lists of 'activity_number' values
    """
    # Features are selected by position: everything except the two trailing label columns
    feature_columns = train_df_arg.columns[:-2]

    train_parts, valid_parts = [], []
    y_train, y_valid = [], []

    for activity in train_df_arg['activity'].unique():
        # drop=True discards the old index directly; going through a temporary 'index' column fails for a
        # named index and would delete a real column that happens to be called 'index'
        activity_data = train_df_arg[train_df_arg['activity'] == activity].reset_index(drop=True)
        split_index = int(training_part * len(activity_data))

        activity_features = activity_data[feature_columns]
        train_parts.append(activity_features.iloc[:split_index])
        valid_parts.append(activity_features.iloc[split_index:])

        activity_labels = activity_data['activity_number'].values
        y_train.extend(list(activity_labels[:split_index]))
        y_valid.extend(list(activity_labels[split_index:]))

    if train_parts:
        # One concatenation at the end instead of re-copying the accumulated frame for every activity
        X_train = pd.concat(train_parts)
        X_valid = pd.concat(valid_parts)
    else:
        # No activities at all: same empty result as before
        X_train = pd.DataFrame()
        X_valid = pd.DataFrame()

    return [X_train, y_train, X_valid, y_valid]

In [10]:
def prepare_target_features(y_arg, one_hot_encoding):
    """Convert target labels to a numpy array, optionally one-hot encoded
    :param y_arg: sequence of label-encoded targets
    :param one_hot_encoding: when truthy, the labels are passed through keras' to_categorical
    :return: numpy array of the labels, or the result of to_categorical for that array
    """
    labels = np.array(y_arg)
    return to_categorical(labels) if one_hot_encoding else labels

In [11]:
def model_training_data_preparation(df_arg):
    """Prepare scaled feature matrices and one-hot targets for model training
    NOTE: an 'activity_number' column is added to df_arg itself, i.e. the caller's dataframe is modified.
    :param df_arg: feature dataframe whose 'activity' column contains only the labels 'Squat', 'Leg land', 'Walk',
    'Lateral squat slide' and 'Jogging' (any other label raises KeyError)
    :return: the tuple (X_train, y_train, X_valid, y_valid); X_* are the outputs of a StandardScaler fitted on the
    training part only, y_* are one-hot encoded targets
    """
    # String label -> integer class code
    label_codes = {'Squat': 0, 'Leg land': 1, 'Walk': 2, 'Lateral squat slide': 3, 'Jogging': 4}
    df_arg['activity_number'] = df_arg['activity'].apply(lambda label: label_codes[label])

    train_features, train_labels, valid_features, valid_labels = split_train_data(train_df_arg=df_arg)

    train_targets = prepare_target_features(y_arg=train_labels, one_hot_encoding=True)
    valid_targets = prepare_target_features(y_arg=valid_labels, one_hot_encoding=True)

    # The scaler learns its statistics from the training part and is only applied to the validation part
    feature_scaler = StandardScaler()
    train_scaled = feature_scaler.fit_transform(train_features)
    valid_scaled = feature_scaler.transform(valid_features)

    return train_scaled, train_targets, valid_scaled, valid_targets

## Pipeline functions

### Original pipeline

In [12]:
def perform_pipeline(df_arg):
    """Run the original data-preparation pipeline on a raw recording
    Steps: drop rows that follow an over-long measurement gap, median-filter the sensor columns, remove the
    'No activity' rows, undersample to balanced classes, discard strongly correlated sensor axes, cut the data
    into windows, compute statistical features per window and prepare the training/validation arrays.
    The dataframe passed by the caller is not modified.
    :param df_arg: raw dataframe with the columns 'time', 'accX', 'accY', 'accZ', 'gyrX', 'gyrY', 'gyrZ'
    and 'activity'
    :return: the tuple (X_train, y_train, X_valid, y_valid) produced by model_training_data_preparation
    """
    # Measurement period stability: keep only the rows that follow the previous row by at most 1.5 average
    # periods (the first row has no previous row and is therefore dropped as well).
    # The explicit copy makes the column assignment below write into an independent frame rather than into
    # the result of a filter, which pandas reports as a chained assignment.
    df_arg = df_arg[df_arg['time'].diff() <= get_avg_period(df_arg, 'time') * 1.5].copy()

    # Data Filtering
    raw_columns = ['accX', 'accY', 'accZ', 'gyrX', 'gyrY', 'gyrZ']
    filtered_columns = ['accX_filtered', 'accY_filtered', 'accZ_filtered', 'gyrX_filtered', 'gyrY_filtered',
                        'gyrZ_filtered']
    df_arg[filtered_columns] = median_filter_data(df_arg=df_arg,
                                                  filter_columns_arg=raw_columns,
                                                  window_size_arg=10)

    # Exploratory Data Analysis
    df_arg = df_arg[df_arg['activity'] != 'No activity']

    # Perform undersampling to get a balanced dataframe
    df_arg = get_undersampled_df(df_arg=df_arg, column_name_arg='activity')

    # Build a correlation matrix and remove certain axes of the accelerometer or gyroscope
    sel_columns = list(filtered_columns)

    # Calculate the correlation matrix for the selected columns
    corr_matrix = df_arg[sel_columns].corr()
    important_columns = ['accX_filtered', 'accY_filtered', 'accZ_filtered']
    discard_columns = get_discard_columns(corr_matrix_arg=corr_matrix,
                                          important_columns_arg=important_columns,
                                          df_arg=df_arg)
    sel_columns = [col for col in sel_columns if col not in discard_columns]
    sel_columns.append('activity')
    filtered_df = df_arg[['time'] + sel_columns].copy()

    # Windowing
    # NOTE: get_windowed_df reads 'gyrZ_filtered' unconditionally, so it fails if that column was discarded above
    windowed_df = get_windowed_df(df_arg=filtered_df, window_duration_arg=2)

    # Feature Engineering
    functions_list = [
        lambda x: x.mean(),  # mean
        lambda x: x.std(),  # std deviation
        lambda x: np.mean(np.absolute(x - np.mean(x))),  # avg absolute diff
        lambda x: x.min(),  # min
        lambda x: x.max(),  # max
        lambda x: x.max() - x.min(),  # range = max-min diff
        lambda x: np.median(x),  # median
        lambda x: np.percentile(x, 75) - np.percentile(x, 25),  # interquartile range
        lambda x: np.sum(x < 0),  # negative count
        lambda x: np.sum(x > 0),  # positive count
        lambda x: stats.skew(x),  # skewness = assymetry
        lambda x: stats.kurtosis(x)  # kurtosis
    ]
    windowed_df = perform_feature_engineering(df_arg=windowed_df, functions_list=functions_list)

    # Model training
    return model_training_data_preparation(df_arg=windowed_df)

## Estimation functions

In [13]:
def get_time_usage(df_arg, pipeline_func=None):
    """Measure how long one run of a pipeline takes
    :param df_arg: dataframe passed to the pipeline
    :param pipeline_func: callable that takes the dataframe as its only argument; defaults to perform_pipeline
    :return: elapsed time of the run in seconds
    """
    if pipeline_func is None:
        pipeline_func = perform_pipeline

    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals; the wall clock
    # returned by time.time() can be adjusted while the pipeline is running
    start_time = time.perf_counter()
    pipeline_func(df_arg)
    return time.perf_counter() - start_time

In [14]:
def get_optimized_time_usage(df_arg):
    """Measure how long one run of perform_optimized_pipeline takes
    :param df_arg: dataframe passed to the pipeline
    :return: elapsed time of the run in seconds
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals; the wall clock
    # returned by time.time() can be adjusted while the pipeline is running
    start_time = time.perf_counter()
    perform_optimized_pipeline(df_arg=df_arg)
    return time.perf_counter() - start_time

In [15]:
def get_memory_usage(df_arg, pipeline_func):
    """Measure how much the resident memory of the current process changes during one pipeline run
    :param df_arg: dataframe passed to the pipeline
    :param pipeline_func: callable that takes the dataframe as its only argument
    :return: difference between the resident set size after and before the run, as reported by psutil
    """
    # NOTE(review): psutil is not among the imports at the top of the notebook - confirm it is imported
    # before this function is called
    current_process = psutil.Process()

    rss_before = current_process.memory_info().rss
    pipeline_func(df_arg)
    rss_after = current_process.memory_info().rss

    return rss_after - rss_before

# Work with data

In [16]:
# Load the training recording; the path is relative to the working directory of the notebook
df = pd.read_csv('data/Train/Train_activities_1_2023-08-23.csv')
# Print the column names, non-null counts, dtypes and memory footprint of the loaded dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63529 entries, 0 to 63528
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   timestamp  63529 non-null  int64  
 1   time       63529 non-null  float64
 2   accX       63529 non-null  float64
 3   accY       63529 non-null  float64
 4   accZ       63529 non-null  float64
 5   gyrX       63529 non-null  float64
 6   gyrY       63529 non-null  float64
 7   gyrZ       63529 non-null  float64
 8   activity   63529 non-null  object 
dtypes: float64(7), int64(1), object(1)
memory usage: 4.4+ MB


### Investigate original pipeline execution time

In [17]:
# Benchmark the original pipeline: run it several times on the same dataframe and report every run
# together with the average
print(f"Original pipeline")
number_of_experiments = 5
time_list = []
for i in range(number_of_experiments):
    # get_time_usage returns the elapsed time of one full pipeline run, in seconds
    temp_time = get_time_usage(df_arg=df)
    time_list.append(temp_time)
    print(f"{i+1}) time = {temp_time:.3f} seconds")
    
# The ' .3f' format reserves a sign position, hence the two spaces after '=' in the printed average
print(f"average execution time = {sum(time_list) / len(time_list): .3f} seconds")

Original pipeline
1) time = 7.071 seconds
2) time = 6.937 seconds
3) time = 7.076 seconds
4) time = 6.956 seconds
5) time = 7.086 seconds
average execution time =  7.025 seconds
