In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# evaluate every file in '../data/gc_data/athlete_2' and return a list of file names that are useable cycling workouts

In [3]:
# Collect the names of the athlete's raw CSV exports.
# os.listdir() makes no ordering guarantee, and later cells split train/test by
# list position, so sort the names to make that split deterministic.
# endswith() is used instead of a substring test so names that merely contain
# '.csv' (e.g. 'x.csv.bak') are not picked up.
files = sorted(
    name
    for name in os.listdir('../data/gc_data/athlete_2')
    if name.endswith('.csv')
)

In [4]:
# Determine if the current df is a useable cycling workout

def check_dataframe(cur_df, alt_tolerance=10, min_signal_sum=5):
    """Return True if ``cur_df`` looks like a useable cycling workout.

    A frame is considered useable when all of the following hold:

    * it has 'power', 'hr', 'cad' and 'alt' columns;
    * the sum of each of 'power', 'hr' and 'cad' is at least ``min_signal_sum``
      (i.e. the sensor actually recorded something);
    * every 'alt' value stays within ``alt_tolerance`` of the first 'alt' value.

    Parameters
    ----------
    cur_df : pandas.DataFrame
        One workout recording.
    alt_tolerance : number, default 10
        Maximum allowed deviation from the starting altitude, in the same units
        as the 'alt' column (the original comment says feet -- unverified).
    min_signal_sum : number, default 5
        Minimum column sum for a signal to count as present.

    Returns
    -------
    bool
    """
    signal_cols = ('power', 'hr', 'cad')

    # A file that lacks a required column cannot be a useable workout; report
    # that instead of raising KeyError and aborting the caller's scan.
    if any(col not in cur_df.columns for col in signal_cols + ('alt',)):
        return False

    # Each signal must contain data. This also rejects an empty frame, whose
    # sums are 0, before the positional lookup below.
    for col in signal_cols:
        if cur_df[col].sum() < min_signal_sum:
            return False

    altitude = cur_df['alt']
    # Positional lookup: ``altitude[0]`` is a label lookup and fails whenever
    # the index does not contain the label 0 (e.g. after filtering rows).
    start_alt = altitude.iloc[0]

    too_high = altitude.max() > start_alt + alt_tolerance
    too_low = altitude.min() < start_alt - alt_tolerance
    return not (too_high or too_low)

In [5]:
useable_files = []

for file in files:
    # Load the recording with nulls replaced by 0 so the checks see plain numbers.
    df = pd.read_csv(f'../data/gc_data/athlete_2/{file}').fillna(0)

    # Skip anything that fails the usability checks; keep the rest.
    if not check_dataframe(df):
        continue
    useable_files.append(file)

In [6]:
# Show the file names that passed check_dataframe.
useable_files

['2018_12_03_20_52_34.csv',
 '2018_12_06_10_56_57.csv',
 '2018_12_09_19_56_36.csv',
 '2018_12_16_20_12_35.csv',
 '2018_12_19_11_05_47.csv',
 '2018_12_27_19_30_12.csv',
 '2018_12_28_19_36_49.csv',
 '2018_12_31_19_40_06.csv',
 '2019_01_04_18_49_51.csv',
 '2019_01_06_20_24_10.csv',
 '2019_01_06_21_05_16.csv',
 '2019_01_07_20_46_10.csv',
 '2019_01_09_11_23_10.csv',
 '2019_01_12_11_00_45.csv',
 '2019_01_14_20_45_35.csv',
 '2019_01_15_20_19_49.csv',
 '2019_01_19_10_38_22.csv',
 '2019_01_20_18_58_47.csv',
 '2019_01_24_20_05_51.csv',
 '2019_01_27_19_30_30.csv',
 '2019_01_28_20_10_24.csv',
 '2019_01_30_11_57_35.csv',
 '2019_01_31_11_30_27.csv',
 '2019_02_02_18_50_32.csv',
 '2019_02_03_18_14_30.csv',
 '2019_02_06_12_24_02.csv',
 '2019_02_07_11_45_23.csv',
 '2019_02_08_11_19_43.csv',
 '2019_02_09_09_23_35.csv',
 '2019_02_11_19_59_05.csv',
 '2019_02_13_20_52_52.csv',
 '2019_02_14_11_56_31.csv',
 '2019_02_16_19_23_30.csv',
 '2019_02_18_20_10_55.csv',
 '2019_02_19_12_48_49.csv',
 '2019_02_20_20_38_1

In [7]:
# Let's only use training sessions from June and July 2019. Summer should be peak fitness season, so the athlete's 
# fitness level should not change too dramatically within that timeframe.

# Keep only the sessions whose file name carries a July or June 2019 stamp,
# then display how many remain.
useable_files = [
    name
    for name in useable_files
    if any(month in name for month in ('2019_07', '2019_06'))
]
len(useable_files)

34

In [8]:
# Split by position: the last 5 entries of the list are held out for testing,
# everything before them is used for training.
training_files, test_files = useable_files[:-5], useable_files[-5:]

In [9]:
# Show the files assigned to the training set.
training_files

['2019_06_01_19_17_14.csv',
 '2019_06_06_12_27_26.csv',
 '2019_06_07_19_15_37.csv',
 '2019_06_09_10_10_10.csv',
 '2019_06_10_20_32_43.csv',
 '2019_06_11_18_42_23.csv',
 '2019_06_12_19_08_15.csv',
 '2019_06_18_12_09_55.csv',
 '2019_06_19_11_24_48.csv',
 '2019_06_20_12_07_44.csv',
 '2019_06_21_19_43_02.csv',
 '2019_06_25_20_34_15.csv',
 '2019_06_28_19_03_00.csv',
 '2019_06_29_20_08_48.csv',
 '2019_07_02_19_08_49.csv',
 '2019_07_03_18_53_01.csv',
 '2019_07_04_18_22_50.csv',
 '2019_07_05_19_53_24.csv',
 '2019_07_06_17_32_26.csv',
 '2019_07_07_19_13_52.csv',
 '2019_07_08_19_01_29.csv',
 '2019_07_09_19_31_40.csv',
 '2019_07_09_20_08_44.csv',
 '2019_07_10_19_08_24.csv',
 '2019_07_12_12_11_44.csv',
 '2019_07_12_20_57_59.csv',
 '2019_07_13_19_12_26.csv',
 '2019_07_17_20_09_01.csv',
 '2019_07_18_19_14_30.csv']

In [10]:
# Show the files held out for testing.
test_files

['2019_07_19_20_04_54.csv',
 '2019_07_20_19_53_50.csv',
 '2019_07_23_20_09_12.csv',
 '2019_07_24_19_15_18.csv',
 '2019_07_31_19_30_08.csv']

## Concatenate the train and test files into train and test dataframes

In [11]:
def preprocess_df(current_df, window=5):
    """Return a cleaned copy of one workout frame.

    Steps:

    1. drop the 'km' and 'alt' columns;
    2. replace 'power', 'hr' and 'cad' with their rolling mean over ``window``
       consecutive rows;
    3. drop every row that contains a NaN (this includes the first
       ``window - 1`` rows, which have no complete rolling window).

    The input frame is left untouched, so the function can be called again on
    the same raw frame. The returned frame keeps the surviving rows' original
    index labels (the index is not reset).

    Parameters
    ----------
    current_df : pandas.DataFrame
        Raw workout frame containing at least 'km', 'alt', 'power', 'hr', 'cad'.
    window : int, default 5
        Number of rows in the rolling-mean window.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If 'km' or 'alt' (or one of the signal columns) is missing.
    """
    signal_cols = ['power', 'hr', 'cad']

    # drop() without inplace returns a new frame, so the caller's data is not mutated.
    cleaned = current_df.drop(columns=['km', 'alt'])

    # Smooth the sensor signals with a rolling mean.
    cleaned[signal_cols] = cleaned[signal_cols].rolling(window).mean()

    return cleaned.dropna()

In [13]:
# Clean each held-out file and write it to its own numbered CSV
# (gc_test_1.csv, gc_test_2.csv, ...) for later use.
for counter, file in enumerate(test_files, start=1):
    file_df = preprocess_df(pd.read_csv(f'../data/gc_data/athlete_2/{file}'))
    file_df.to_csv(f'../data/cleaned_gc_data/gc_test_{counter}.csv')

In [14]:
# Load the first training file and the first test file; these frames are the
# seeds handed to generate_dataframe.
train_df, test_df = (
    pd.read_csv(f'../data/gc_data/athlete_2/{first_file}')
    for first_file in (training_files[0], test_files[0])
)

# Build one combined dataframe from a list of workout files.

def generate_dataframe(files, concat_df, data_dir='../data/gc_data/athlete_2'):
    """Preprocess a set of workout files and stack them into one dataframe.

    ``concat_df`` is expected to be the already-loaded raw frame for
    ``files[0]``; it is preprocessed and used as the first chunk. The remaining
    files (``files[1:]``) are read from ``data_dir``, preprocessed with
    ``preprocess_df`` and appended in order.

    All chunks are collected first and concatenated once, rather than calling
    ``pd.concat`` inside the loop, which re-copies the accumulated frame on
    every iteration.

    Parameters
    ----------
    files : list of str
        CSV file names; the first entry is skipped because ``concat_df``
        already holds its data.
    concat_df : pandas.DataFrame
        Raw frame for the first file.
    data_dir : str, default '../data/gc_data/athlete_2'
        Directory the remaining files are read from.

    Returns
    -------
    pandas.DataFrame
        All preprocessed chunks stacked row-wise with a fresh 0..n-1 index
        (the index is reset even when only one chunk is present).
    """
    frames = [preprocess_df(concat_df)]
    frames.extend(
        preprocess_df(pd.read_csv(f'{data_dir}/{file}'))
        for file in files[1:]
    )
    return pd.concat(frames, ignore_index=True)

In [15]:
# Build the combined training frame. train_df goes in as the raw frame for
# training_files[0] and the name is rebound to the combined result.
# NOTE(review): because the name is rebound, re-running this cell without first
# re-running the cell that loads train_df will presumably fail (the 'km'/'alt'
# columns that preprocess_df drops are no longer present) -- confirm.
train_df = generate_dataframe(training_files, train_df)
train_df

Unnamed: 0,secs,power,hr,cad
0,4,33.4,76.0,28.2
1,5,57.4,77.0,39.6
2,6,80.8,78.0,47.4
3,7,100.6,79.0,56.0
4,8,111.4,80.0,59.6
...,...,...,...,...
109912,3480,0.0,92.4,0.0
109913,3481,0.0,92.0,0.0
109914,3482,0.0,91.6,0.0
109915,3483,0.0,91.2,0.0


In [16]:
# Build the combined test frame. test_df goes in as the raw frame for
# test_files[0] and the name is rebound to the combined result.
# NOTE(review): because the name is rebound, re-running this cell without first
# re-running the cell that loads test_df will presumably fail (the 'km'/'alt'
# columns that preprocess_df drops are no longer present) -- confirm.
test_df = generate_dataframe(test_files, test_df)
test_df

Unnamed: 0,secs,power,hr,cad
0,4,98.4,98.0,77.0
1,5,112.2,100.0,77.4
2,6,118.0,100.0,78.2
3,7,126.0,100.0,79.6
4,8,137.0,100.2,80.8
...,...,...,...,...
20434,5323,0.0,96.0,0.0
20435,5440,0.0,95.4,0.0
20436,5441,0.0,95.0,0.0
20437,5442,0.0,94.6,0.0


In [17]:
# Sanity check: (rows, columns) of the combined train and test frames.
train_df.shape, test_df.shape

((109917, 4), (20439, 4))

In [18]:
# Sanity check: per-column null counts for the combined train and test frames.
train_df.isnull().sum(), test_df.isnull().sum()

(secs     0
 power    0
 hr       0
 cad      0
 dtype: int64,
 secs     0
 power    0
 hr       0
 cad      0
 dtype: int64)

In [19]:
# Sanity check: column dtypes of the combined train and test frames.
train_df.dtypes, test_df.dtypes

(secs       int64
 power    float64
 hr       float64
 cad      float64
 dtype: object,
 secs       int64
 power    float64
 hr       float64
 cad      float64
 dtype: object)

In [20]:
# Save the combined train and test frames as CSV.
# NOTE(review): to_csv is called without index=False, so pandas' default of also
# writing the row index as an unnamed first column presumably applies -- confirm
# that downstream readers expect that extra column.
train_df.to_csv('../data/cleaned_gc_data/gc_train.csv')
test_df.to_csv('../data/cleaned_gc_data/gc_test.csv')