# Preprocessing

En este notebook usamos los datos RAW descargados en ingest.sh y los preprocesamos para crear un DataFrame. Más adelante, este df será un input para el Feature Store.


# Step 1: JSON to CSV

In [4]:
import os

# Directory holding the raw export downloaded from Garmin
path = 'all_data'

# List every file inside `path`. os.listdir returns entries in arbitrary order,
# so sort them to keep the downstream row order reproducible across runs and machines.
ficheros = sorted(os.listdir(path))
# ficheros

In [1]:
# Alternative: use a list comprehension
# cap_languages = [language.capitalize() for language in languages]

# NOTE: as written, this comprehension yields one boolean per file, not the matching file names
# ficheros_json2 = [i.find(substr) != -1 for i in ficheros]
# ficheros_json2

In [30]:
# --- Build the list of JSON activity-summary files
# `substr` selects the files we are interested in (file names that contain it)
substr = 'summary.json'
ficheros_json = [nombre for nombre in ficheros if substr in nombre]

# One summary line instead of one printed line per directory entry
print('agregamos a la lista {} de {} ficheros'.format(len(ficheros_json), len(ficheros)))

In [31]:
# ---- Load every activity-summary JSON (Garmin activities) into a list
import json

lista_json = []

for nombre_fichero in ficheros_json:
    # Build the file path from the data directory under its own name, so the
    # directory variable `path` is not overwritten with the last file path.
    ruta_fichero = os.path.join(path, nombre_fichero)

    # Explicit encoding: open() otherwise falls back to the locale's default
    with open(ruta_fichero, encoding='utf-8') as fh:
        lista_json.append(json.load(fh))

# Show only the count; echoing the list would dump every activity into the output
len(lista_json)

In [7]:
# Number of activity JSONs loaded into lista_json
len(lista_json)

139

In [32]:
import pandas as pd
from pandas import json_normalize

# --- Extract summaryDTO from each JSON: it is the part we want to analyse

# Collect one record per activity and build the DataFrame in a single call.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every iteration.
registros = [
    # Copy the summary (instead of mutating the loaded JSON) and attach the activity name
    {**miJson['summaryDTO'], 'activityName': miJson['activityName']}
    for miJson in lista_json
]
df = pd.DataFrame(registros)

# Preview only the first rows rather than rendering the whole frame
df.head()

In [16]:
# List every column extracted from the summaryDTO records
df.columns.to_list()

['startTimeLocal',
 'startTimeGMT',
 'distance',
 'duration',
 'movingDuration',
 'elapsedDuration',
 'averageSpeed',
 'averageMovingSpeed',
 'maxSpeed',
 'calories',
 'averageHR',
 'maxHR',
 'averageRunCadence',
 'maxRunCadence',
 'minActivityLapDuration',
 'activityName',
 'activeSets',
 'avgVerticalSpeed',
 'isDecoDive',
 'totalSets',
 'strideLength',
 'endLatitude',
 'endLongitude',
 'maxElevation',
 'maxVerticalSpeed',
 'minElevation',
 'startLatitude',
 'startLongitude',
 'elevationGain',
 'numberOfActiveLengths',
 'totalExerciseReps',
 'elevationLoss']

In [14]:
# json_filt = {
#     'summaryDTO.calories': info['summaryDTO.calories'], 
#     'summaryDTO.distance': info['summaryDTO.distance'] 
#   }
# # json_filt
#   'calories': 148.92530345917206,
#   'distance': 1168.16,
#   'duration': 1257.692,
#   'elapsedDuration': 1257.692,
#   'maxHR': 131.0,

# miJson['summaryDTO']


In [10]:
# --- Preselected features for our MVP model
lista = [
    'activityName', 'startTimeLocal', 'movingDuration',
    'averageHR', 'maxHR', 'calories',
    'distance', 'duration',
    'averageSpeed', 'maxSpeed',
]

In [17]:
# Keep only the preselected feature columns, in the order given by `lista`
df = df.loc[:, lista]
df.head()

Unnamed: 0,activityName,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed
0,Caminar,2019-04-27T15:54:35.0,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049
1,Cardio,2019-05-09T19:57:25.0,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7
2,Carrera,2020-01-22T18:40:27.0,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17
3,Caminar,2019-04-29T17:09:17.0,1741.0,135.0,148.0,327.922062,2912.31,1822.028,1.598,4.26001
4,Caminar,2019-05-19T17:27:23.0,1744.0,139.0,160.0,379.950454,2797.68,1764.769,1.585,5.109863


In [12]:
# # --- Convert startTimeLocal to day, month and year only (without the time)
# df['startTimeLocal'] = pd.to_datetime(df['startTimeLocal']).dt.date
# df[lista].head()

In [18]:
# Merge activity names that are spelled differently but are equivalent for our model.
# The mapping is nested under 'activityName' so that only that column is rewritten;
# a flat mapping would replace matching values in every column of df.
df = df.replace(
    {
        'activityName': {
            'Caminata': 'Caminar',
            'Las Heras Caminata': 'Caminar',
            'Las Heras Caminar': 'Caminar',
            'Mendoza Caminata': 'Caminar',
        }
    }
)

In [19]:
# Check which distinct activity names remain after the replacement
df['activityName'].unique()

array(['Caminar', 'Cardio', 'Carrera', 'Entrenamiento de fuerza'],
      dtype=object)

In [20]:
# Count missing values per column
df.isnull().sum()

activityName       0
startTimeLocal     0
movingDuration     0
averageHR          0
maxHR              0
calories          25
distance           0
duration           0
averageSpeed       0
maxSpeed           4
dtype: int64

Debido a problemas con la función 'load_feature_definitions', que no reconoce la columna activityName, hacemos un encoding para pasarla a INT, con lo cual se resuelve el issue.

In [21]:
# Basic one-hot encoding of the activityName variable.
# The dtype is pinned to uint8: pandas >= 2.0 returns bool columns by default,
# which would defeat the purpose of turning activityName into integer columns.
encoding_activityName_df = pd.get_dummies(
    df['activityName'],
    prefix='activityName',
    prefix_sep='_',
    dtype='uint8',
)
encoding_activityName_df.head()

Unnamed: 0,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,1,0,0,0
4,1,0,0,0


In [22]:
# Append the one-hot columns to the right-hand side of df
df = pd.concat((df, encoding_activityName_df), axis='columns')
df.head()

Unnamed: 0,activityName,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,Caminar,2019-04-27T15:54:35.0,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1,0,0,0
1,Cardio,2019-05-09T19:57:25.0,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,0,1,0,0
2,Carrera,2020-01-22T18:40:27.0,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,0,0,1,0
3,Caminar,2019-04-29T17:09:17.0,1741.0,135.0,148.0,327.922062,2912.31,1822.028,1.598,4.26001,1,0,0,0
4,Caminar,2019-05-19T17:27:23.0,1744.0,139.0,160.0,379.950454,2797.68,1764.769,1.585,5.109863,1,0,0,0


In [23]:
# Drop activityName now that the one-hot columns stand in for it
df = df.drop(columns=['activityName'])
df.head()

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,2019-04-27T15:54:35.0,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1,0,0,0
1,2019-05-09T19:57:25.0,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,0,1,0,0
2,2020-01-22T18:40:27.0,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,0,0,1,0
3,2019-04-29T17:09:17.0,1741.0,135.0,148.0,327.922062,2912.31,1822.028,1.598,4.26001,1,0,0,0
4,2019-05-19T17:27:23.0,1744.0,139.0,160.0,379.950454,2797.68,1764.769,1.585,5.109863,1,0,0,0


In [None]:
# del df['startTimeLocal']

Debido a que load_feature_definitions no reconoce la columna 'startTimeLocal', hay que hacer algunas transformaciones. Obtuvimos un buen resultado transformándola a INT.

In [24]:
# Convert startTimeLocal into an integer of the form YYYYMMDD.
# Guard on the dtype so that re-running the cell is a no-op: feeding the already
# converted integers back into pd.to_datetime would parse them as nanoseconds
# since the epoch and silently turn every date into 19700101.
if not pd.api.types.is_integer_dtype(df['startTimeLocal']):
    df['startTimeLocal'] = (
        pd.to_datetime(df['startTimeLocal'])  # parse the timestamp strings
        .dt.strftime('%Y%m%d')                # keep the date only, without dashes
        .astype(int)                          # digits-only string -> integer
    )

display(df['startTimeLocal'])
display(df.head(3))

0      20190427
1      20190509
2      20200122
3      20190429
4      20190519
         ...   
134    20190523
135    20200120
136    20200116
137    20200129
138    20190418
Name: startTimeLocal, Length: 139, dtype: int64

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,20190427,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1,0,0,0
1,20190509,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,0,1,0,0
2,20200122,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,0,0,1,0


In [25]:
# Rename 'activityName_Entrenamiento de fuerza' because of errors in 'garmin_feature_group.create'
df = df.rename(columns={'activityName_Entrenamiento de fuerza': 'activityName_fuerza'})
df.head(3)

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_fuerza
0,20190427,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1,0,0,0
1,20190509,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,0,1,0,0
2,20200122,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,0,0,1,0


In [27]:
# Inspect column dtypes and non-null counts of the preprocessed frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   startTimeLocal        139 non-null    int64  
 1   movingDuration        139 non-null    float64
 2   averageHR             139 non-null    float64
 3   maxHR                 139 non-null    float64
 4   calories              114 non-null    float64
 5   distance              139 non-null    float64
 6   duration              139 non-null    float64
 7   averageSpeed          139 non-null    float64
 8   maxSpeed              135 non-null    float64
 9   activityName_Caminar  139 non-null    uint8  
 10  activityName_Cardio   139 non-null    uint8  
 11  activityName_Carrera  139 non-null    uint8  
 12  activityName_fuerza   139 non-null    uint8  
dtypes: float64(8), int64(1), uint8(4)
memory usage: 10.4 KB


In [29]:
# Persist the preprocessed dataset; the row index is written as the first, unnamed column
df.to_csv('data.csv', index=True)

# Auxiliares

In [66]:
# Reload the saved dataset and discard the 'Unnamed: 0' column that holds the saved row index
df = pd.read_csv('data.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,activityName,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed
0,Caminata,2019-04-27,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049
1,Cardio,2019-05-09,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7
2,Carrera,2020-01-22,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17
3,Caminata,2019-04-29,1741.0,135.0,148.0,327.922062,2912.31,1822.028,1.598,4.26001
4,Caminata,2019-05-19,1744.0,139.0,160.0,379.950454,2797.68,1764.769,1.585,5.109863


In [34]:
# Order the activities chronologically (ascending is the default)
df = df.sort_values('startTimeLocal')
df.head()

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_fuerza
46,20190406,1413.0,156.0,170.0,432.933494,2298.88,1636.874,1.404,4.26001,1,0,0,0
97,20190406,893.0,160.0,183.0,254.891385,1385.97,901.194,1.538,4.25,1,0,0,0
68,20190406,532.0,180.0,198.0,174.939499,1767.98,539.618,3.276,8.519989,0,0,1,0
76,20190406,1110.0,97.0,114.0,152.027914,1951.73,1207.587,1.616,4.25,1,0,0,0
93,20190406,152.0,130.0,143.0,33.890053,517.28,154.357,3.351,22.550003,0,0,1,0
