## Data Processing

* Rename columns
* Focus on acceleration and ignore the angular velocity (for now)
* Smooth the values to help reduce noise
* Calculate the acceleration vector magnitude
* Focus on readings near the acceleration peak
* Scale the data
* Create training and testing datasets
    * Each row would contain the acceleration magnitudes for a single throw (file)
    * Append the radar gun reading to the observations
    * Optional: Add wrist velocities

In [26]:
import os
import joblib as jb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from typing import Optional, List, Dict, Union

from scipy.integrate import cumtrapz # intergrate trapezoidal rule
from scipy.signal import savgol_filter

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# graph styling
# IPython magic (not plain Python): asks the notebook to render figures inline.
%matplotlib inline 
# NOTE(review): recent Matplotlib releases renamed the bundled seaborn styles
# to 'seaborn-v0_8-*' -- confirm the installed version still accepts this name.
plt.style.use('seaborn-dark')
# Global font sizes: tick labels (10) < axis labels (14) < titles (16).
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['axes.titlesize'] = 16

In [None]:
# Load the radar gun readings (one row per throw file) and order them from
# the slowest to the fastest throw, renumbering the rows 0..n-1 afterwards.
radar_gun = (
    pd.read_csv('Speed-Filenames.csv')
    .sort_values(by=['Speed'])
    .reset_index(drop=True)
)

In [27]:
def rename_observation_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Give the raw IMU export headers short, lowercase names.
    Columns that are not listed in the mapping keep their original name and
    the input dataframe itself is left untouched (a renamed copy is returned).
    Note: the mapping is specific to the column headers of this kaggle dataset
    inputs:
        df: dataframe, raw imu file data
    returns:
        dataframe with the time / acc_* / gyro_* column names
    '''
    short_names = {
        "Time_s_": "time",
        "Acc_x_m_s_2_": "acc_x",
        "Acc_y_m_s_2_": "acc_y",
        "Acc_z_m_s_2_": "acc_z",
        "Gyro_x_1_s_": "gyro_x",
        "Gyro_y_1_s_": "gyro_y",
        "Gyro_z_1_s_": "gyro_z",
    }
    renamed = df.rename(columns=short_names)
    return renamed

def filter_acceleration_columns(df: pd.DataFrame, regex: Optional[str] = 'acc.*') -> pd.DataFrame:
    '''
    Select the columns whose name matches a regular expression.
    With the default pattern this keeps every column whose name contains
    'acc' (the pattern is searched for, it is not anchored to the start).
    inputs:
        df: dataframe, imu file data (columns already renamed)
        regex: string, pattern handed to DataFrame.filter, 'acc.*' by default
    returns:
        dataframe holding only the matching columns
    '''
    matching_columns = df.filter(regex=regex)
    return matching_columns

def rolling_average(vector: np.ndarray, window_width: int) -> np.ndarray:
    '''
    Calculate the rolling average of a vector
    Only full windows are averaged, so the result has
    len(vector) - window_width + 1 entries (empty when the window is wider
    than the vector).
    input:
        vector: numpy array, input to calculate the moving average
        window_width: integer >= 1, defines how many units in the vector to
            average together
    returns:
        numpy array with the mean of every full window
    raises:
        ValueError: if window_width is smaller than 1
    '''
    # A zero or negative width would turn the slices below into something
    # unrelated to a window (negative widths silently return wrong numbers).
    if window_width < 1:
        raise ValueError(
            f'window_width must be a positive integer, got {window_width}'
        )

    # Prefix sums with a leading 0: the sum of any window is the difference
    # of two prefix sums that are window_width positions apart.
    cumsum_vec = np.cumsum(np.insert(vector, 0, 0))
    ma_vec = (cumsum_vec[window_width:] - cumsum_vec[:-window_width]) / window_width
    return ma_vec

In [None]:
def process_data(dictionary_df: pd.DataFrame, path_to_data: str) -> None:
    '''
    Transform every throw file and append its features to per-feature csv files
    For each filename the raw file is read, pushed through the pipeline
    returned by pipeline_steps() (defined elsewhere in the notebook), tagged
    with the radar gun speed, and the rows labelled 'acc_mag', 'veloc_mag'
    and 'gyro_mag' are appended to acceleration.csv, velocity.csv and
    angular_velocity.csv in the current working directory.
    inputs:
        dictionary_df: dataframe with a 'Filename' and a 'Speed' column
        path_to_data: string, prefix placed directly in front of the
            filename (include the trailing path separator)
    returns:
        None, the results are written to disk in append mode; the header is
        written only for the first file handled by this call
    '''
    # row label in the transformed frame -> csv file collecting that feature
    feature_files = (
        ('acc_mag', 'acceleration.csv'),
        ('veloc_mag', 'velocity.csv'),
        ('gyro_mag', 'angular_velocity.csv'),
    )
    first_file = True

    # Pair filenames and speeds by position so the lookup does not depend on
    # dictionary_df having a default 0..n-1 index.
    for filename, speed in zip(dictionary_df['Filename'], dictionary_df['Speed']):
        path = f'{path_to_data}{filename}.txt'
        data = pd.read_csv(path)

        # rename column names
        processed_data = rename_observation_columns(data)

        # keep the acceleration columns
        processed_data = filter_acceleration_columns(processed_data)

        # establish pipeline steps
        pipe = pipeline_steps()

        # fit and transform the data
        # NOTE(review): the pipeline is fed the raw `data`, so the renamed and
        # filtered `processed_data` above is currently unused -- confirm which
        # frame pipeline_steps() expects before switching the argument.
        transformed = pipe.fit_transform(data)

        # append the associated speedometer reading to every record
        # (presumably one record per feature row -- TODO confirm layout)
        transformed['speed'] = speed

        # split the acceleration, angular velocity and forward velocity.
        # create dataset for each feature
        for row_label, csv_name in feature_files:
            transformed.loc[[row_label], :].to_csv(csv_name,
                                                   mode='a', index=False,
                                                   header=first_file)

        # only the first file of this call writes the header row
        first_file = False
     
#process_data(speeds_df, path)