## Data Processing

* Rename columns
* Focus on acceleration and ignore the angular velocity (for now)
* Smoothen the values to help reduce noise
* Calculate the acceleration vector magnitude
* Focusing on readings near the acceleration peak
* Scale the data
* Create a training and testing datasets
    * Each row would contain the acceleration magnitudes for a single throw (file)
    * Append the radar gun reading to the observations
    * Optional: Add wrist velocities

In [2]:
import os
import joblib as jb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from typing import Optional, List, Dict, Union

from scipy.integrate import cumtrapz # intergrate trapezoidal rule
from scipy.signal import savgol_filter

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# graph styling
%matplotlib inline 
plt.style.use('seaborn-dark')
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16

In [3]:
radar_gun = pd.read_csv('Speed-Filenames.csv')
radar_gun = radar_gun.sort_values(by=['Speed']).reset_index(drop=True)

In [27]:
def rename_observation_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Rename the column names in the individual throws dataframes
    Note: Specific to this kaggle dataset columns
    inputs:
        df: dataframe, raw imu file data
    '''
    return df.rename(columns={
        "Time_s_": "time", 
        "Acc_x_m_s_2_": "acc_x",
        "Acc_y_m_s_2_": "acc_y",
        "Acc_z_m_s_2_": "acc_z",
        "Gyro_x_1_s_": "gyro_x",
        "Gyro_y_1_s_": "gyro_y",
        "Gyro_z_1_s_": "gyro_z",
    })

def filter_acceleration_columns(df: pd.DataFrame, regex: Optional[str] = 'acc.*') -> pd.DataFrame:
    '''
    Keeps only the acceleration columns (start with acc)
    inputs:
        df: dataframe, renamed columns raw imu file data
        regex: string, a regular expression 'acc.*' by default
    '''
    return df.filter(regex=regex)

def rolling_average(vector: np.array, window_width: Optional[int] = 151) -> np.array:
    '''
    Calculate the rolling average of a vector
    input:
        vector: numpy array, input to calculate the moving average
        window_with: integer, defines how many units in the vector to average together 151 items by default
    '''
    cumsum_vec = np.cumsum(np.insert(vector, 0, 0))
    ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width
    return ma_vec

def calculate_vector_magnitude(df: pd.DataFrame, 
                               x_vector: Optional[str] = 'acc_x',
                               y_vector: Optional[str] = 'acc_y',
                               z_vector: Optional[str] = 'acc_z') -> np.array:
    '''
    Calculate the magnitude of a 3D vector (x, y, z axis)
    The square root the sum of the squared vector values
    sqrt(Ax^2 + Ay^2 + Az^2)
    input:
        df: df: dataframe, renamed columns raw imu file data
        x_vector: string, x axis column name 'acc_x' by default
        y_vector: string, y axis column name 'acc_y' by default
        z_vector: string, z axis column name 'acc_z' by default    
    '''
    
    # square the vectors
    x_squared = df[x_vector] ** 2
    y_squared = df[y_vector] ** 2
    z_squared = df[z_vector] ** 2
    
    # square root the sum of squared values (magnitude) 
    return np.sqrt(sum([x_squared + y_squared + z_squared]))

def identify_maximum_acceleration(df: pd.DataFrame, 
                                  numeric_column: Optional[str] = 'acc_x') -> pd.DataFrame:
    '''
    Slices a dataframe to just 100 observations before, and after the maximum value of a numeric value
    input:
        df: dataframe, renamed columns raw imu file data
        numeric_column: string, numeric values column name 'acc_x' by default
    '''
    # find the index for the maximum value in the numeric column
    maximum_idx = df[numeric_column].idxmax()
    
    # capture 100 frames before and after the maximum value
    maximum = df.loc[maximum_idx - 100: maximum_idx + 100]
    return maximum

def intergrate_column(df: pd.DataFrame, 
                      columns_to_integrate: Optional[List[str]] = ['acc_x', 'acc_y', 'acc_z'],
                      integrated_columns_names: Optional[List[str]] = ['veloc_x', 'veloc_y', 'veloc_z'],
                      x_column: Optional[str] = 'time') -> pd.DataFrame:
    '''
    
    '''
    assert len(integrated_columns_names) == len(columns_to_integrate), \
        "The new columns name array should be the same length as the columns to integerate array"   
    
    for i in range(len(integrated_columns_names)):
        df[integrated_columns_names[i]] = np.append(0.0, cumtrapz(df[columns_to_integrate[i]], x=df[x_column])) 
    return df

def scale_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    '''
    
    '''
    # refrence column names 
    column_names = df.columns.tolist()
    
    # instanciate scaler and fit to the dataframe
    scaler = StandardScaler()  
    scaled_features = scaler.fit_transform(df)
    
    # create a new dataframe with the scaled features
    scaled_df = pd.DataFrame(scaled_features, columns=column_names, index=df.index)
    return scaled_df

In [79]:
def process_data(dictionary_df: pd.DataFrame, path_to_data: str) -> None:
    filenames = dictionary_df['Filename'].to_numpy()
    speeds = dictionary_df['Speed'].to_numpy()
    first_file = True
    
    for i, filename in enumerate(filenames):
        path = f'{path_to_data}{filename}.txt'
        data = pd.read_csv(path)
        
        # rename column names
        proccessed_data = rename_observation_columns(data)
        
        # calculate the velocities
        proccessed_data = intergrate_column(proccessed_data)
        
        # drop gyroscope data
        gryo_mask = proccessed_data.columns.str.startswith('gyro')
        gyro_columns = proccessed_data.columns[gryo_mask]
        proccessed_data = proccessed_data.drop(gyro_columns, axis=1)
        
        # smooth the imu readings by calculating the rolling average
        proccessed_data = proccessed_data.apply(lambda column: rolling_average(column.to_numpy()))
        
        # find the maximum value on x axis acceleration and slice 100 frames before and after that point
        proccessed_data = identify_maximum_acceleration(proccessed_data).reset_index(drop=True)
        
        # calculate acceleration magnitude 
        proccessed_data['acceleration_magnitude'] = calculate_vector_magnitude(proccessed_data)
        
        # calculate acceleration magnitude 
        proccessed_data['velocity_magnitude'] = calculate_vector_magnitude(proccessed_data,
                                                                           'veloc_x', 
                                                                           'veloc_y', 
                                                                           'veloc_z')
        
        # keep only magnitude columns
        mag_mask = proccessed_data.columns.str.endswith('magnitude')
        magnitude_columns = proccessed_data.columns[mag_mask]
        proccessed_data = proccessed_data.loc[:, magnitude_columns]
        
        # rehsape the values into a single row and re-create the dataframe
        column_names = [f'acceleration_{i}' if i < 201 else f'velocity_{i-201}' for i in range(len(proccessed_data) * 2)]
        flattened_values = proccessed_data.to_numpy().flatten().reshape(1, -1)
        
        # reshpe the dataframe into a single row
        proccessed_data = pd.DataFrame(flattened_values, columns=column_names)
        proccessed_data['speed'] = speeds[i]
        
        # append the row data into a new document
        proccessed_data.to_csv('magnitudes.csv', mode='a', index=False, header=first_file)
        first_file = False
    
    return 
path = 'raw_imu_data/'     
process_data(radar_gun, path)

In [80]:
pd.read_csv('magnitudes.csv')

Unnamed: 0,acceleration_0,acceleration_1,acceleration_2,acceleration_3,acceleration_4,acceleration_5,acceleration_6,acceleration_7,acceleration_8,acceleration_9,...,velocity_192,velocity_193,velocity_194,velocity_195,velocity_196,velocity_197,velocity_198,velocity_199,velocity_200,speed
0,40.267806,4.591906,40.939320,4.631849,41.626443,4.673024,42.329588,4.715496,43.049186,4.759334,...,43.878436,69.811863,44.009299,67.931676,44.136299,66.084627,44.259527,64.276220,44.379089,59
1,35.353324,6.551532,36.130051,6.608026,36.932782,6.666167,37.764277,6.726039,38.627357,6.787730,...,44.354233,55.460990,44.455643,54.599261,44.555156,53.790989,44.652897,53.033205,44.748986,63
2,20.623461,8.793857,20.965851,8.826784,21.321567,8.859985,21.691096,8.893478,22.074893,8.927277,...,37.999374,68.635315,38.123864,67.336328,38.245084,66.041512,38.363000,64.752636,38.477592,64
3,40.618824,4.929379,41.235261,4.967951,41.875856,5.007598,42.542012,5.048389,43.235081,5.090397,...,45.849827,72.254275,45.986249,70.498624,46.119135,68.798853,46.248619,67.160491,46.374840,65
4,36.808097,4.537868,37.353313,4.590959,37.913555,4.644995,38.489589,4.700011,39.082185,4.756044,...,44.401339,80.399933,44.550592,78.380078,44.695691,76.349556,44.836621,74.315216,44.973379,65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,42.666708,15.133016,43.670120,15.215444,44.707731,15.299951,45.781511,15.386617,46.893523,15.475522,...,62.012548,80.139807,62.154105,78.778138,62.292997,77.472685,62.429349,76.223917,62.563282,84
101,41.159132,12.841350,42.166239,12.923284,43.207505,13.007337,44.284819,13.093582,45.399976,13.182095,...,60.036358,82.291525,60.178619,80.856343,60.318081,79.471956,60.454864,78.139729,60.589082,84
102,41.275057,16.803493,42.400130,16.886574,43.567725,16.971923,44.779045,17.059623,46.035083,17.149761,...,65.181516,82.393571,65.329952,81.104120,65.475809,79.863011,65.619175,78.668724,65.760131,85
103,44.129649,12.463115,45.201714,12.549247,46.311676,12.637455,47.461334,12.727820,48.652356,12.820428,...,61.307041,89.573812,61.466654,88.268077,61.623709,87.016205,61.778323,85.817565,61.930604,85


In [None]:
# identify the area around the maximum x axis acceleration 

def identify_maximum_acceleration(df: pd.DataFrame, 
                                  numeric_column: Optional[str] = 'acc_x') -> pd.DataFrame:
    '''
    Slices a dataframe to just 100 observations before, and after the maximum value of a numeric value
    input:
        df: dataframe, renamed columns raw imu file data
        numeric_column: string, numeric values column name 'acc_x' by default
    '''
    # find the index for the maximum value in the numeric column
    maximum_idx = df[numeric_column].idxmax()
    
    # capture 100 frames before and after the maximum value
    maximum = df.loc[maximum_idx - 100: maximum_idx + 100]
    return maximum

    
peak_idx = processing_df['acc_x'].idxmax()
peak_acc = processing_df.loc[peak_idx - 100:peak_idx + 100 , :]