In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Evaluate every file in '../data/original_gc_data/' and build a list of the file names that are useable cycling workouts

## List all available csv workout files

In [3]:
# Create a files variable that contains all of our data files.
# os.listdir returns entries in arbitrary order, so sort them: the file names
# are timestamps, and the positional train/test split relies on them being
# in chronological order.
files = sorted(os.listdir('../data/original_gc_data/'))

# keep only the csv files (match the extension, not any '.csv' substring)
files = [file for file in files if file.endswith('.csv')]

## Keep only the csv workout files that are useable

In [4]:
# Determine if the current df is a useable cycling workout

def check_dataframe(cur_df, min_signal=5, max_alt_drift=10):
    """Decide whether a workout dataframe is a useable cycling workout.

    A workout is useable when the 'power', 'hr' and 'cad' columns each sum to
    at least ``min_signal`` and every 'alt' value stays within
    ``max_alt_drift`` of the first 'alt' value.

    Parameters
    ----------
    cur_df : pandas.DataFrame
        Workout data with 'power', 'hr', 'cad' and 'alt' columns.
    min_signal : number, default 5
        Minimum column sum for power, heart rate and cadence to count as
        "has data".
    max_alt_drift : number, default 10
        Largest allowed deviation from the starting altitude, in the unit of
        the 'alt' column (NOTE(review): originally described as feet --
        confirm against the data source).

    Returns
    -------
    bool
        True when every check passes. False otherwise, including when the
        frame has no rows or is missing one of the required columns.
    """
    sensor_columns = ('power', 'hr', 'cad')

    # A frame without rows or without a required column cannot be a useable
    # workout; report that instead of raising KeyError.
    if cur_df.empty:
        return False
    if any(col not in cur_df.columns for col in sensor_columns + ('alt',)):
        return False

    # Must have power, hr and cadence data
    for col in sensor_columns:
        if cur_df[col].sum() < min_signal:
            return False

    # The altitude must stay within max_alt_drift of the starting elevation.
    # .iloc[0] is positional, so this works for any index, not only 0..n-1.
    altitude = cur_df['alt']
    start_alt = altitude.iloc[0]
    if altitude.max() > start_alt + max_alt_drift or altitude.min() < start_alt - max_alt_drift:
        return False

    return True

In [5]:
useable_files = []

for file in files:
    # read the workout and impute 0 for nulls before running the checks
    df = pd.read_csv(f'../data/original_gc_data/{file}').fillna(0)

    # check and see if the file is useable
    if check_dataframe(df):
        useable_files.append(file)

In [6]:
useable_files

['2018_12_03_20_52_34.csv',
 '2018_12_06_10_56_57.csv',
 '2018_12_09_19_56_36.csv',
 '2018_12_16_20_12_35.csv',
 '2018_12_19_11_05_47.csv',
 '2018_12_27_19_30_12.csv',
 '2018_12_28_19_36_49.csv',
 '2018_12_31_19_40_06.csv',
 '2019_01_04_18_49_51.csv',
 '2019_01_06_20_24_10.csv',
 '2019_01_06_21_05_16.csv',
 '2019_01_07_20_46_10.csv',
 '2019_01_09_11_23_10.csv',
 '2019_01_12_11_00_45.csv',
 '2019_01_14_20_45_35.csv',
 '2019_01_15_20_19_49.csv',
 '2019_01_19_10_38_22.csv',
 '2019_01_20_18_58_47.csv',
 '2019_01_24_20_05_51.csv',
 '2019_01_27_19_30_30.csv',
 '2019_01_28_20_10_24.csv',
 '2019_01_30_11_57_35.csv',
 '2019_01_31_11_30_27.csv',
 '2019_02_02_18_50_32.csv',
 '2019_02_03_18_14_30.csv',
 '2019_02_06_12_24_02.csv',
 '2019_02_07_11_45_23.csv',
 '2019_02_08_11_19_43.csv',
 '2019_02_09_09_23_35.csv',
 '2019_02_11_19_59_05.csv',
 '2019_02_13_20_52_52.csv',
 '2019_02_14_11_56_31.csv',
 '2019_02_16_19_23_30.csv',
 '2019_02_18_20_10_55.csv',
 '2019_02_19_12_48_49.csv',
 '2019_02_20_20_38_1

## Narrow the search down

In [7]:
# Restrict the data to training sessions from June and July 2019. Summer should be
# peak fitness season (race season), and the athlete's fitness level should not
# change too dramatically within a couple of months.

useable_files = [
    name
    for name in useable_files
    if any(month in name for month in ('2019_07', '2019_06'))
]
len(useable_files)

34

In [8]:
# Hold out the last 5 files for testing; everything before them is used for training
training_files, test_files = useable_files[:-5], useable_files[-5:]

In [9]:
training_files

['2019_06_01_19_17_14.csv',
 '2019_06_06_12_27_26.csv',
 '2019_06_07_19_15_37.csv',
 '2019_06_09_10_10_10.csv',
 '2019_06_10_20_32_43.csv',
 '2019_06_11_18_42_23.csv',
 '2019_06_12_19_08_15.csv',
 '2019_06_18_12_09_55.csv',
 '2019_06_19_11_24_48.csv',
 '2019_06_20_12_07_44.csv',
 '2019_06_21_19_43_02.csv',
 '2019_06_25_20_34_15.csv',
 '2019_06_28_19_03_00.csv',
 '2019_06_29_20_08_48.csv',
 '2019_07_02_19_08_49.csv',
 '2019_07_03_18_53_01.csv',
 '2019_07_04_18_22_50.csv',
 '2019_07_05_19_53_24.csv',
 '2019_07_06_17_32_26.csv',
 '2019_07_07_19_13_52.csv',
 '2019_07_08_19_01_29.csv',
 '2019_07_09_19_31_40.csv',
 '2019_07_09_20_08_44.csv',
 '2019_07_10_19_08_24.csv',
 '2019_07_12_12_11_44.csv',
 '2019_07_12_20_57_59.csv',
 '2019_07_13_19_12_26.csv',
 '2019_07_17_20_09_01.csv',
 '2019_07_18_19_14_30.csv']

In [10]:
test_files

['2019_07_19_20_04_54.csv',
 '2019_07_20_19_53_50.csv',
 '2019_07_23_20_09_12.csv',
 '2019_07_24_19_15_18.csv',
 '2019_07_31_19_30_08.csv']

### Preprocess each of the test dataframes and save them individually for future use

In [11]:
# Number the saved test files starting at 1
for counter, file in enumerate(test_files, start=1):

    # read in file to dataframe and get rid of unneeded columns
    file_df = pd.read_csv(f'../data/original_gc_data/{file}').drop(columns=['km', 'alt'])

    # write to csv
    file_df.to_csv(f'../data/cleaned_gc_data/gc_test_{counter}.csv')

## Concatenate the train and test files into train and test dataframes

In [12]:
# Seed each combined dataframe with the first file of its split
train_df, test_df = [
    pd.read_csv(f'../data/original_gc_data/{first_file}')
    for first_file in (training_files[0], test_files[0])
]

def generate_dataframe(files, concat_df):
    """Combine a list of workout files into a single dataframe.

    Parameters
    ----------
    files : list of str
        File names inside '../data/original_gc_data/'. The first entry is
        skipped because its data is supplied through ``concat_df``.
    concat_df : pandas.DataFrame
        Data already loaded from ``files[0]``. It must still contain the
        'km' and 'alt' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``concat_df`` followed by the rows of every remaining
        file, in list order, without the 'km' and 'alt' columns and with a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``concat_df`` or any of the files lacks 'km' or 'alt'.
    """
    unneeded_columns = ['km', 'alt']

    # drop() returns a new frame, so the caller's dataframe is left untouched
    frames = [concat_df.drop(columns=unneeded_columns)]

    for file in files[1:]:
        # read in file to dataframe
        cur_df = pd.read_csv(f'../data/original_gc_data/{file}')

        # get rid of unneeded columns and queue the frame for concatenation
        frames.append(cur_df.drop(columns=unneeded_columns))

    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    return pd.concat(frames, ignore_index=True)

In [13]:
# Combined dataframe used for model training
train_df = generate_dataframe(files=training_files, concat_df=train_df)
train_df

Unnamed: 0,secs,power,hr,cad
0,0,0.0,74.0,0.0
1,1,1.0,75.0,22.0
2,2,11.0,76.0,22.0
3,3,53.0,77.0,48.0
4,4,102.0,78.0,49.0
...,...,...,...,...
110028,3480,0.0,91.0,0.0
110029,3481,0.0,91.0,0.0
110030,3482,0.0,91.0,0.0
110031,3483,0.0,91.0,0.0


In [14]:
# Combined dataframe used for model testing
test_df = generate_dataframe(files=test_files, concat_df=test_df)
test_df

Unnamed: 0,secs,power,hr,cad
0,0,75,90.0,76.0
1,1,95,100.0,76.0
2,2,89,100.0,75.0
3,3,92,100.0,78.0
4,4,141,100.0,80.0
...,...,...,...,...
20454,5323,0,96.0,0.0
20455,5440,0,93.0,0.0
20456,5441,0,94.0,0.0
20457,5442,0,94.0,0.0


In [15]:
# (rows, columns) of the training dataset, then of the test dataset
tuple(frame.shape for frame in (train_df, test_df))

((110033, 4), (20459, 4))

### Check data types and check for nulls

In [16]:
# Per-column count of missing values in the training and test dataframes
train_df.isna().sum(), test_df.isna().sum()

(secs     0
 power    0
 hr       0
 cad      0
 dtype: int64,
 secs     0
 power    0
 hr       0
 cad      0
 dtype: int64)

In [17]:
# Column dtypes of the training dataframe, then of the test dataframe
tuple(frame.dtypes for frame in (train_df, test_df))

(secs       int64
 power    float64
 hr       float64
 cad      float64
 dtype: object,
 secs       int64
 power      int64
 hr       float64
 cad      float64
 dtype: object)

## Save out

In [18]:
# Persist the combined training and test dataframes for the modelling notebooks
train_df.to_csv(path_or_buf='../data/cleaned_gc_data/gc_train.csv')
test_df.to_csv(path_or_buf='../data/cleaned_gc_data/gc_test.csv')