<a href="https://colab.research.google.com/github/JRKagumba/2D-video-based-running-analysis/blob/main/notebooks/01_Apply_Preprocessing_steps_to_Raw_Data_and_Save_Plots.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mount Drive

In [20]:
# Mount Google Drive into the Colab filesystem at /content/gdrive; the data
# paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


### Import Libraries

In [21]:
import pandas as pd
import numpy as np
import json
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker 

from scipy import stats

### Initialize Helper Functions 

In [22]:
def zeros_to_nans(df, threshold=0.0001):
    """Mark undetected keypoints as missing values.

    OpenPose returns 0 for a keypoint it could not detect. Every entry that is
    strictly below ``threshold`` (this includes negative values) is replaced
    with NaN so that later interpolation can fill the gap.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric keypoint coordinates. NOTE: modified in place.
    threshold : float, default 0.0001
        Values below this are treated as "not detected".

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, after modification.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df[df < threshold] = np.nan
    return df

def apply_interpolation(df):
    """Return a new frame in which each column's NaN gaps are interpolated.

    Every column is processed independently with pandas' default
    ``Series.interpolate()`` settings; the input frame is left untouched.
    """
    def _fill_gaps(column):
        return column.interpolate()

    return df.apply(_fill_gaps, axis=0)

def remove_high_frequency_noise_with_rolling_window_function(df):
    """Smooth every column with a centred, Gaussian-weighted rolling mean.

    The window spans 5 rows and the Gaussian weights use ``std=5``. Rows that
    do not have a full window around them come back as NaN.
    """
    gaussian_window = df.rolling(window=5, win_type='gaussian', center=True)
    smoothed = gaussian_window.mean(std=5)
    return smoothed

def cut_out_noise_at_beginning_of_clips(df):
    """
    Remove noisy signals present before individual has started to run.

    For every column the row with the largest absolute row-to-row change is
    located. Those rows are histogrammed into 10 bins, and everything up to
    and including the (rounded) right edge of the most populated bin is
    dropped.

    Row *positions* are used throughout, so the result does not depend on the
    labels of ``df.index``.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric keypoint coordinates, one row per frame.

    Returns
    -------
    pd.DataFrame
        The remaining rows with a fresh 0-based index. If ``df`` has fewer
        than two rows, or no column has a usable difference, all rows are
        returned (index reset).
    """
    if len(df) < 2:
        return df.reset_index(drop=True)

    # Absolute frame-to-frame change; columns that are entirely NaN carry no
    # information about where the jump is and would make nanargmax raise.
    jumps = df.diff().abs().dropna(axis=1, how='all')
    if jumps.shape[1] == 0:
        return df.reset_index(drop=True)

    # Position (not label) of the largest jump in each remaining column.
    jump_rows = np.nanargmax(jumps.to_numpy(dtype=float), axis=0)

    # Find where most columns agree the big jump happened.
    counts, edges = np.histogram(
        jump_rows, bins=10, range=(jump_rows.min(), jump_rows.max())
    )
    busiest_bin = int(np.argmax(counts))
    cut_row = int(round(float(edges[busiest_bin + 1])))

    return df.iloc[cut_row + 1:].reset_index(drop=True)


def remove_high_frequency_noise_at_single_index(df):
    """
    Removes high frequency noise at a particularly troublesome index.

    For every column the row with the largest absolute row-to-row change is
    located; the row position most columns agree on (smallest one on a tie)
    is taken as the troublesome spot. The row immediately before that
    position is blanked in all columns and then re-filled by interpolation.

    Row *positions* are used throughout, so frames whose index does not start
    at 0 (e.g. the result of ``DataFrame.tail``) are handled correctly. The
    input frame is not modified.

    NOTE(review): the blanked row is ``ind - 1`` (the row *before* the largest
    jump), exactly as in the original implementation — confirm that this, and
    not row ``ind`` itself, is the intended target.

    Parameters
    ----------
    df : pd.DataFrame
        Numeric keypoint coordinates, one row per frame.

    Returns
    -------
    pd.DataFrame
        A new frame with the selected row replaced by interpolated values.
    """
    def _interpolate_columns(frame):
        return frame.apply(lambda col: col.interpolate(), axis=0)

    if len(df) < 2:
        return _interpolate_columns(df)

    # Absolute frame-to-frame change; all-NaN columns cannot vote.
    jumps = df.diff().abs().dropna(axis=1, how='all')
    if jumps.shape[1] == 0:
        return _interpolate_columns(df)

    # Position (not label) of the largest jump in each remaining column.
    jump_rows = np.nanargmax(jumps.to_numpy(dtype=float), axis=0)

    # Most common position. np.unique sorts ascending and argmax returns the
    # first maximum, so ties resolve to the smallest position. This avoids
    # scipy.stats.mode, whose return shape changed between SciPy versions.
    candidates, votes = np.unique(jump_rows, return_counts=True)
    ind = int(candidates[np.argmax(votes)])

    # Work on a copy so the caller's frame (often a slice) is left untouched.
    cleaned = df.copy()
    cleaned.iloc[ind - 1:ind] = np.nan
    return _interpolate_columns(cleaned)


def apply_preprocessing_steps(df: pd.DataFrame, sample: str, player_metrics: pd.DataFrame,
                              frames_per_second: int = 30):
    """Run the full cleaning pipeline on one sample's raw keypoint data.

    Steps:
      1. Copy ``df``, turn undetected keypoints into NaN and interpolate.
      2. Try to trim the noisy start of the clip.
      3. Keep only the last ``round(40_time * frames_per_second)`` rows.
      4. Repeatedly repair the single most troublesome row, then smooth with
         the Gaussian rolling window.

    If the trimmed clip is at least as long as the required number of frames
    it is used (4 repair passes); otherwise the untrimmed, interpolated data
    is used instead (10 repair passes).

    Parameters
    ----------
    df : pd.DataFrame
        Raw keypoint coordinates for one sample; not modified.
    sample : str
        Sample code, matched against ``player_metrics['code']``.
    player_metrics : pd.DataFrame
        Must contain the columns ``'code'`` and ``'40_time'``.
    frames_per_second : int, default 30
        Factor converting ``40_time`` into a row count (presumably the video
        frame rate — confirm against the recording settings).

    Returns
    -------
    pd.DataFrame
        The cleaned and smoothed data.

    Raises
    ------
    ValueError
        If ``sample`` does not match exactly one row of ``player_metrics``.
    """
    df_processed = apply_interpolation(zeros_to_nans(df.copy()))
    trimmed_df = cut_out_noise_at_beginning_of_clips(df_processed)

    # Look the time up as a scalar instead of calling int() on a Series,
    # which is deprecated in pandas and fails unclearly on 0 or 2+ matches.
    matching_times = player_metrics.loc[player_metrics['code'] == sample, '40_time']
    if len(matching_times) != 1:
        raise ValueError(
            f"expected exactly one player_metrics row with code {sample!r}, "
            f"found {len(matching_times)}"
        )
    required_frames = int(round(float(matching_times.iloc[0]) * frames_per_second))

    if len(trimmed_df) >= required_frames:
        clip, repair_passes = trimmed_df, 4
    else:
        # Trimming removed too much: fall back to the untrimmed data and
        # compensate with more repair passes.
        clip, repair_passes = df_processed, 10

    clip = clip.tail(required_frames)
    for _ in range(repair_passes):
        clip = remove_high_frequency_noise_at_single_index(clip)

    return remove_high_frequency_noise_with_rolling_window_function(clip)




# Keypoint index -> short joint label, listed in index order (0..24).
# NOTE(review): ordering looks like OpenPose's 25-keypoint body model — confirm.
keywords_dict = dict(enumerate((
    'NOSE', 'NECK', 'RSHO', 'RELB', 'RWRI',   #  0 -  4
    'LSHO', 'LELB', 'LWRI', 'MHIP', 'RHIP',   #  5 -  9
    'RKNE', 'RANK', 'LHIP', 'LKNE', 'LANK',   # 10 - 14
    'REYE', 'LEYE', 'REAR', 'LEAR', 'LBTO',   # 15 - 19
    'LSTO', 'LHEL', 'RBTO', 'RSTO', 'RHEL',   # 20 - 24
)))

def save_plot_as_array_of_subplots(df, plot_name, save_path):
    """
    Saves plot in path, no return type
    Serves as a quick view of what the csv data looks like

    One subplot is drawn per entry of ``keywords_dict`` on a 5x5 grid, showing
    that keypoint's ``<NAME>_X`` and ``<NAME>_Y`` columns against the row
    number, with the x axis labelled as a percentage of the clip length.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``<NAME>_X`` and ``<NAME>_Y`` columns for every name in
        ``keywords_dict``.
    plot_name : str
        Figure title.
    save_path : str
        Destination passed to ``Figure.savefig``.
    """
    # Tick spacing is derived from the row count; keep it non-zero for an
    # empty frame so the locators stay valid.
    n_rows = max(len(df), 1)

    fig, axs = plt.subplots(5, 5, figsize=(25, 25), constrained_layout=True)
    try:
        for ax, val in zip(axs.flat, keywords_dict.values()):

            ax.set_title(f'{val}')
            ax.set_xlabel('Time (%)', fontsize=10)
            ax.set_ylabel(f'{val}-Position', fontsize=10)

            ax.plot(df[f'{val}_X'], label=f'{val}_X')
            ax.plot(df[f'{val}_Y'], label=f'{val}_Y')

            ax.legend()
            ax.margins(x=0.01)

            # Major ticks every 20 % of the clip, minor ticks every 5 %.
            ax.xaxis.set_major_locator(ticker.MultipleLocator(n_rows / 5))
            ax.xaxis.set_minor_locator(ticker.MultipleLocator(n_rows / 20))
            ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=n_rows))

        fig.suptitle(plot_name, fontsize=25)
        # Save this figure explicitly instead of pyplot's "current" figure.
        fig.savefig(save_path)
    finally:
        # Always release the figure; this runs in a loop over many samples.
        plt.close(fig)

### Configure Save Paths and Run the Preprocessing Pipeline

In [23]:
# Root folder holding one sub-folder per playing position, each of which
# holds one sub-folder per sample.
PROJ_SAVE_ROOT = '/content/gdrive/MyDrive/ColabNotebooks/BiomechanicsAnalysis/___40_YARD_DASH/data/processed'

# Only real directories are positions; stray files (e.g. .DS_Store) would make
# the nested os.listdir fail. Sorting gives a reproducible processing order.
positions_list = sorted(
    entry for entry in os.listdir(PROJ_SAVE_ROOT)
    if os.path.isdir(os.path.join(PROJ_SAVE_ROOT, entry))
)


player_metrics_file_path = '/content/gdrive/MyDrive/ColabNotebooks/BiomechanicsAnalysis/___40_YARD_DASH/data/40_yard_player_metrics.csv'
player_metrics = pd.read_csv(player_metrics_file_path, usecols=['code','40_time'])

for position_folder in positions_list:
    position_root = os.path.join(PROJ_SAVE_ROOT, position_folder)
    position_samples = sorted(
        entry for entry in os.listdir(position_root)
        if os.path.isdir(os.path.join(position_root, entry))
    )

    print(position_folder)
    for sample in position_samples:

        print(f'\t{sample}')

        # Define paths: every artefact of a sample lives in the sample's folder.
        sample_root = os.path.join(position_root, sample)
        sample_csv_path = os.path.join(sample_root, f'{sample}.csv')
        raw_plot_path = os.path.join(sample_root, f'{sample}_plot_raw.png')

        processed_data_path = os.path.join(sample_root, f'{sample}_data_processed.csv')
        processed_plot_path = os.path.join(sample_root, f'{sample}_plot_processed.png')

        # Apply processing steps (the raw frame itself is left unmodified).
        df = pd.read_csv(sample_csv_path, index_col=0)
        df_processed = apply_preprocessing_steps(df, sample, player_metrics)

        # Save data
        save_plot_as_array_of_subplots(df, sample, raw_plot_path)
        print(f'\t\t{sample:20} raw plot saved')

        df_processed.to_csv(processed_data_path)
        print(f'\t\t{sample:20} processed data saved')

        save_plot_as_array_of_subplots(df_processed, sample, processed_plot_path)
        print(f'\t\t{sample:20} processed plot saved')

defensive_back
	defensive_back_01
		defensive_back_01    raw plot saved
		defensive_back_01    processed data saved
		defensive_back_01    processed plot saved
	defensive_back_02
		defensive_back_02    raw plot saved
		defensive_back_02    processed data saved
		defensive_back_02    processed plot saved
	defensive_back_03
		defensive_back_03    raw plot saved
		defensive_back_03    processed data saved
		defensive_back_03    processed plot saved
	defensive_back_04
		defensive_back_04    raw plot saved
		defensive_back_04    processed data saved
		defensive_back_04    processed plot saved
	defensive_back_05
		defensive_back_05    raw plot saved
		defensive_back_05    processed data saved
		defensive_back_05    processed plot saved
	defensive_back_06
		defensive_back_06    raw plot saved
		defensive_back_06    processed data saved
		defensive_back_06    processed plot saved
	defensive_back_07
		defensive_back_07    raw plot saved
		defensive_back_07    processed data saved
		defensive_ba