## Data Processing Summary

* Rename columns
* Focus on acceleration and ignore the angular velocity (for now)
* Smooth the values to help reduce noise
* Calculate the acceleration vector magnitude
* Focus on readings near the acceleration peak
* Scale the data
* Create training and testing datasets
    * Each row would contain the acceleration magnitudes for a single throw (file)
    * Append the radar gun reading to the observations
    * Optional: Add wrist velocities

In [1]:
import joblib as jb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import Optional, List, Dict, Union

from scipy.integrate import cumtrapz # intergrate trapezoidal rule

# graph styling
%matplotlib inline 
# apply the dark seaborn theme to every figure drawn after this point
# NOTE(review): newer matplotlib releases renamed the seaborn styles to 'seaborn-v0_8-*' — confirm this name still resolves
plt.style.use('seaborn-dark')
# font sizes: tick labels first, then axis labels and titles
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16

In [2]:
# load the radar gun table; process_data reads its 'Filename' and 'Speed' columns
radar_gun = pd.read_csv('Speed-Filenames.csv')
# order the rows by ascending Speed and renumber the index from 0
radar_gun = radar_gun.sort_values(by=['Speed']).reset_index(drop=True)

In [5]:
def rename_observation_columns(df: pd.DataFrame, columns: Optional[Dict] = None) -> pd.DataFrame:
    '''
    Rename the column names in the individual throws dataframes
    Note: Specific to this kaggle dataset columns
    inputs:
        df: dataframe, raw imu file data
        columns: dictionary, maps raw column names to the new names
            None (default) uses the built-in mapping for the time, accelerometer and gyroscope columns
    returns:
        a new dataframe with the renamed columns, df itself is left untouched
    '''
    if columns is None:
        # built per call so no mutable default object is shared between calls
        columns = {
            "Time_s_": "time",
            "Acc_x_m_s_2_": "acc_x",
            "Acc_y_m_s_2_": "acc_y",
            "Acc_z_m_s_2_": "acc_z",
            "Gyro_x_1_s_": "gyro_x",
            "Gyro_y_1_s_": "gyro_y",
            "Gyro_z_1_s_": "gyro_z",
        }
    return df.rename(columns=columns)

def filter_columns(df: pd.DataFrame, regex: Optional[str] = 'acc.*') -> pd.DataFrame:
    '''
    Select the columns whose names match a regular expression
    With the default pattern this keeps the acceleration columns
    Note: the pattern is searched for anywhere in the column name, it is not anchored to the start
    inputs:
        df: dataframe, renamed columns raw imu file data
        regex: string, a regular expression 'acc.*' by default
    returns:
        a dataframe holding only the matching columns
    '''
    matching_columns = df.filter(regex=regex, axis='columns')
    return matching_columns

def rolling_average(vector: np.array, window_width: Optional[int] = 151) -> np.array:
    '''
    Smooth a vector with a moving average computed from its running total
    The output is shorter than the input: len(vector) - window_width + 1 values
    input:
        vector: numpy array, input to calculate the moving average
        window_width: integer, defines how many units in the vector to average together 151 items by default
    returns:
        numpy array, the mean of every full window
    '''
    # running total with a leading zero so that the sum of any window is a difference of two entries
    running_total = np.insert(vector, 0, 0).cumsum()
    window_sums = running_total[window_width:] - running_total[:-window_width]
    return window_sums / window_width

def calculate_vector_magnitude(df: pd.DataFrame, 
                               x_vector: Optional[str] = 'acc_x',
                               y_vector: Optional[str] = 'acc_y',
                               z_vector: Optional[str] = 'acc_z') -> np.array:
    '''
    Row by row length of a 3D vector stored in three dataframe columns
    sqrt(Ax^2 + Ay^2 + Az^2)
    input:
        df: dataframe, renamed columns raw imu file data
        x_vector: string, x axis column name 'acc_x' by default
        y_vector: string, y axis column name 'acc_y' by default
        z_vector: string, z axis column name 'acc_z' by default
    returns:
        the magnitudes, one value per row of df (a pandas Series when df is a dataframe)
    '''
    x_component = df[x_vector]
    y_component = df[y_vector]
    z_component = df[z_vector]

    # sum of the squared components, added in x, y, z order
    squared_length = x_component ** 2 + y_component ** 2 + z_component ** 2
    return np.sqrt(squared_length)

def identify_maximum_acceleration(df: pd.DataFrame, 
                                  numeric_column: Optional[str] = 'acc_x',
                                  window: int = 100) -> pd.DataFrame:
    '''
    Slices a dataframe to the rows around the maximum value of a numeric column
    input:
        df: dataframe, renamed columns raw imu file data
            the index labels must support integer arithmetic (e.g. the default 0..n-1 index)
        numeric_column: string, numeric values column name 'acc_x' by default
        window: integer, how many index labels to keep on each side of the maximum, 100 by default
    returns:
        dataframe, the rows labelled from (peak - window) to (peak + window), both ends included
        that is 2 * window + 1 rows when the peak is at least window rows away from both ends
    '''
    # index label of the largest value in the numeric column
    maximum_idx = df[numeric_column].idxmax()

    # .loc slices by label and includes both end points
    return df.loc[maximum_idx - window: maximum_idx + window]

def intergrate_column(df: pd.DataFrame, 
                      columns_to_integrate: Optional[List[str]] = None,
                      integrated_columns_names: Optional[List[str]] = None,
                      x_column: Optional[str] = 'time') -> pd.DataFrame:
    '''
    Integrate the acceleration of the wrist with the cumulative trapezoidal rule to produce the velocity
    Note: the new columns are added to df in place and the same dataframe is returned
    input:
        df: dataframe, renamed columns raw imu file data
        columns_to_integrate: array, list containing columns to integrate
            None (default) means ['acc_x', 'acc_y', 'acc_z']
        integrated_columns_names: array, list containing names for the new integrated columns
            None (default) means ['veloc_x', 'veloc_y', 'veloc_z']
        x_column: string, the x axis column name to integrate against, 'time' by default
    returns:
        df with one extra column per integrated column, every new column starts at 0.0
    raises:
        AssertionError when the two column lists have different lengths
    '''
    # imported here because cumtrapz is the older SciPy name for this function
    # and is not available in newer SciPy releases
    try:
        from scipy.integrate import cumulative_trapezoid
    except ImportError:
        from scipy.integrate import cumtrapz as cumulative_trapezoid

    # defaults are created per call instead of sharing mutable lists between calls
    if columns_to_integrate is None:
        columns_to_integrate = ['acc_x', 'acc_y', 'acc_z']
    if integrated_columns_names is None:
        integrated_columns_names = ['veloc_x', 'veloc_y', 'veloc_z']

    # raised explicitly so the check is not stripped when python runs with -O
    if len(integrated_columns_names) != len(columns_to_integrate):
        raise AssertionError(
            "The new columns name array should be the same length as the columns to integerate array")

    x_values = df[x_column].to_numpy()
    for source_name, target_name in zip(columns_to_integrate, integrated_columns_names):
        # initial=0 prepends the starting value so the result has the same length as the input
        df[target_name] = cumulative_trapezoid(df[source_name].to_numpy(), x=x_values, initial=0)
    return df

def scale_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Standardise every column to zero mean and unit variance
    Each column is scaled on its own as (value - column mean) / column standard deviation,
    using the population standard deviation (ddof=0)
    Columns with (near) zero spread are only centred, so a constant column becomes all zeros
    instead of dividing by zero
    inputs:
        df: dataframe, numeric columns only
    returns:
        a new float dataframe with the same column names and index as df
    '''
    values = df.to_numpy(dtype=float)

    # nothing to fit on, hand back an equally shaped float frame
    if values.shape[0] == 0:
        return pd.DataFrame(values, columns=df.columns, index=df.index)

    column_means = values.mean(axis=0)
    column_stds = values.std(axis=0)

    # a constant column has no spread, divide by 1 rather than by (almost) zero
    column_stds[column_stds < 10 * np.finfo(column_stds.dtype).eps] = 1.0

    scaled_features = (values - column_means) / column_stds

    # create a new dataframe with the scaled features
    return pd.DataFrame(scaled_features, columns=df.columns, index=df.index)

def process_data(dictionary_df: pd.DataFrame, path_to_data: str) -> Optional[pd.DataFrame]:
    '''
    Turn every throw file listed in dictionary_df into one feature row and append it to magnitudes.csv
    Each row holds the scaled acceleration magnitudes, then the scaled velocity magnitudes,
    then the radar gun speed of that throw
    inputs:
        dictionary_df: dataframe, must contain the 'Filename' and 'Speed' columns
        path_to_data: string, prefix placed directly in front of '<filename>.txt' (include the trailing slash)
    returns:
        the dataframe held by the last loop iteration, or None when no file was read
        Note: processing stops at the first error, which is printed; in that case the
        returned dataframe can be a partially processed throw
    Note: rows are appended (mode='a'), so running this twice duplicates the rows in magnitudes.csv
    '''
    filenames = dictionary_df['Filename'].to_numpy()
    speeds = dictionary_df['Speed'].to_numpy()
    first_file = True

    # defined up front so the final return works even when no file was read
    proccessed_data = None

    try:
        for filename, speed in zip(filenames, speeds):
            path = f'{path_to_data}{filename}.txt'
            data = pd.read_csv(path)

            # rename column names
            proccessed_data = rename_observation_columns(data)

            # calculate the velocities
            proccessed_data = intergrate_column(proccessed_data)

            # drop gyroscope data
            gyro_mask = proccessed_data.columns.str.startswith('gyro')
            gyro_columns = proccessed_data.columns[gyro_mask]
            proccessed_data = proccessed_data.drop(gyro_columns, axis=1)

            # smooth the imu readings by calculating the rolling average
            proccessed_data = proccessed_data.apply(lambda column: rolling_average(column.to_numpy()))

            # find the maximum value on x axis acceleration and slice 100 frames before and after that point
            proccessed_data = identify_maximum_acceleration(proccessed_data).reset_index(drop=True)

            # calculate acceleration magnitude
            proccessed_data['acceleration_magnitude'] = calculate_vector_magnitude(proccessed_data)

            # calculate velocity magnitude
            proccessed_data['velocity_magnitude'] = calculate_vector_magnitude(proccessed_data,
                                                                               'veloc_x',
                                                                               'veloc_y',
                                                                               'veloc_z')

            # keep only magnitude columns (acceleration first, velocity second)
            mag_mask = proccessed_data.columns.str.endswith('magnitude')
            magnitude_columns = proccessed_data.columns[mag_mask]
            proccessed_data = proccessed_data.loc[:, magnitude_columns]

            # create a standard scaled version of the data
            proccessed_data = scale_dataframe(proccessed_data)

            # one name per kept frame; 201 frames unless the peak sits close to an end of the recording
            frame_count = len(proccessed_data)
            column_names = ([f'acceleration_{k}' for k in range(frame_count)]
                            + [f'velocity_{k}' for k in range(frame_count)])

            # flatten column by column (order='F') so all accelerations come first and then all
            # velocities, matching column_names; a row-major flatten would interleave the two
            flattened_values = proccessed_data.to_numpy().flatten(order='F').reshape(1, -1)

            # reshape the dataframe into a single row
            proccessed_data = pd.DataFrame(flattened_values, columns=column_names)
            proccessed_data['speed'] = speed

            # append the row data into a new document, the header is written only once
            proccessed_data.to_csv('magnitudes.csv', mode='a', index=False, header=first_file)
            first_file = False

    except Exception as e:
        print(str(e))

    return proccessed_data

# prefix of the raw throw files; the trailing slash matters because process_data
# concatenates it directly with the filename
path = 'raw_imu_data/'     
# runs the whole pipeline; df receives whatever process_data returns
df = process_data(radar_gun, path)