# Feature Store
En este notebook creamos un Feature Store (FS) con los datos de nuestras actividades deportivas diarias.




# Step 1: Set Up

In [2]:
# SageMaker Python SDK version 2.x is required
import sagemaker
import sys

# Record the SDK version that was loaded before the install below runs.
# NOTE(review): original_version is not read again in the visible cells.
original_version = sagemaker.__version__
# Install/upgrade the SDK so that a 2.x release is available.
%pip install 'sagemaker>=2.0.0'


You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
!pip3 install numpy==1.19.5

Collecting numpy==1.19.5
  Using cached numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl (14.8 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.6
    Uninstalling numpy-1.21.6:
      Successfully uninstalled numpy-1.21.6
Successfully installed numpy-1.19.5
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [8]:
import pandas as pd
import numpy as np

import boto3
import io

from sagemaker.session import Session
from sagemaker import get_execution_role

# Session first: the region and the default bucket are both read from it.
sagemaker_session = Session()
region = sagemaker_session.boto_region_name
s3_bucket_name = sagemaker_session.default_bucket()

# Execution role of this notebook; passed as role_arn when the feature group is created.
role = get_execution_role()

# Key prefix under the bucket; used to build the s3_uri of the feature group.
prefix = "sagemaker-featurestore-garmin"

In [9]:
s3_bucket_name

'sagemaker-us-east-1-007007875348'

# Step 2: Import Data
Importamos los datos que fueron preprocesados en preprocessing.py



In [6]:
# Load the preprocessed dataset. The CSV is written together with its pandas index,
# which read_csv surfaces as an extra "Unnamed: 0" column. Drop it if present, so the
# cell also works with a file that was saved without the index.
df = pd.read_csv("../data.csv").drop(columns="Unnamed: 0", errors="ignore")

In [7]:
df.head()

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,EventTime,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,2019-04-27,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1654777000.0,1,0,0,0
1,2019-05-09,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,1654777000.0,0,1,0,0
2,2020-01-22,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,1654777000.0,0,0,1,0
3,2019-04-29,1741.0,135.0,148.0,327.922062,2912.31,1822.028,1.598,4.26001,1654777000.0,1,0,0,0
4,2019-05-19,1744.0,139.0,160.0,379.950454,2797.68,1764.769,1.585,5.109863,1654777000.0,1,0,0,0


In [8]:
df.dtypes

startTimeLocal                           object
movingDuration                          float64
averageHR                               float64
maxHR                                   float64
calories                                float64
distance                                float64
duration                                float64
averageSpeed                            float64
maxSpeed                                float64
EventTime                               float64
activityName_Caminar                      int64
activityName_Cardio                       int64
activityName_Carrera                      int64
activityName_Entrenamiento de fuerza      int64
dtype: object

In [29]:
# df['activityName'] = df['activityName'].astype(str)
# df.dtypes

# Step 3: Create feature groups

Creamos el Feature Group name para los datos importados, luego creamos un objeto para instanciar el FG.

In [14]:
import time
from time import strftime, gmtime

# Suffix the name with the current UTC day-hour-minute-second so that every run of
# this cell produces a fresh feature group name.
feature_group_name = f"garmin-feature-group-{strftime('%d-%H-%M-%S', gmtime())}"

In [15]:
display(feature_group_name)

'garmin-feature-group-11-14-12-17'

Instanciamos un objeto FeatureGroup para los datos garmin.

In [2]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Local handle for the Garmin feature group, bound to the name and session defined
# earlier. The group itself is registered later through .create().
garmin_feature_group = FeatureGroup(name=feature_group_name, sagemaker_session=sagemaker_session)

In [133]:
# Echo the FeatureGroup object. The notebook only defines `garmin_feature_group`;
# a bare `feature_group` is not assigned in any visible cell and would raise
# NameError on a fresh kernel.
garmin_feature_group

FeatureGroup(name='garmin-feature-group-11-14-12-17', sagemaker_session=<sagemaker.session.Session object at 0x7f495d531150>, feature_definitions=[FeatureDefinition(feature_name='startTimeLocal', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>), FeatureDefinition(feature_name='movingDuration', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>), FeatureDefinition(feature_name='averageHR', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>), FeatureDefinition(feature_name='maxHR', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>), FeatureDefinition(feature_name='calories', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>), FeatureDefinition(feature_name='distance', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>), FeatureDefinition(feature_name='duration', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>), FeatureDefinition(feature_name='averageSpeed', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>), FeatureDefinition(feature_name='max

In [137]:
import time

# Column whose value identifies a record in the feature group.
record_identifier_feature_name = "startTimeLocal"

# Current Unix time in whole seconds; round() on a float already returns an int.
current_time_sec = round(time.time())

Agregamos la columna EventTime al dataframe. Es un parámetro requerido por el FG.


In [16]:
# Every row gets the same event timestamp, stored as float64.
# Assigning the scalar broadcasts it to all rows. Wrapping it in a fresh pd.Series
# would align on the index and leave NaN wherever df's index is not exactly
# 0..len(df)-1 (for example after filtering rows).
df["EventTime"] = float(current_time_sec)

In [36]:
# Several labels name the same activity; collapse all of them onto 'Caminar' because
# they are equivalent for our model.
df = df.replace(
    dict.fromkeys(
        [
            'Caminata',
            'Las Heras Caminata',
            'Las Heras Caminar',
            'Caminar',
            'Mendoza Caminata',
        ],
        'Caminar',
    )
)

In [37]:
df['activityName'].unique()

array(['Caminar', 'Cardio', 'Carrera', 'Entrenamiento de fuerza'],
      dtype=object)

In [17]:
df.isnull().sum()

startTimeLocal                           0
movingDuration                           0
averageHR                                0
maxHR                                    0
calories                                25
distance                                 0
duration                                 0
averageSpeed                             0
maxSpeed                                 4
EventTime                                0
activityName_Caminar                     0
activityName_Cardio                      0
activityName_Carrera                     0
activityName_Entrenamiento de fuerza     0
dtype: int64

Debido a problemas con la función 'load_feature_definitions', que no reconoce la columna activityName, hacemos un encoding para pasarla a INT, con lo cual se resuelve el issue.

In [42]:
# One-hot encode activityName: one 0/1 column per category, named activityName_<category>.
encoding_activityName_df = pd.get_dummies(
    df['activityName'],
    prefix='activityName',
    prefix_sep='_',
)
encoding_activityName_df.head()

Unnamed: 0,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,1,0,0,0
4,1,0,0,0


In [43]:
# Append the one-hot columns to df, side by side (column-wise concatenation).
df = pd.concat((df, encoding_activityName_df), axis="columns")
df.head()

Unnamed: 0,activityName,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,EventTime,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,Caminar,2019-04-27,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1654777000.0,1,0,0,0
1,Cardio,2019-05-09,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,1654777000.0,0,1,0,0
2,Carrera,2020-01-22,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,1654777000.0,0,0,1,0
3,Caminar,2019-04-29,1741.0,135.0,148.0,327.922062,2912.31,1822.028,1.598,4.26001,1654777000.0,1,0,0,0
4,Caminar,2019-05-19,1744.0,139.0,160.0,379.950454,2797.68,1764.769,1.585,5.109863,1654777000.0,1,0,0,0


In [45]:
# Drop the original activityName column, in place: it has been replaced by the four
# activityName_* indicator columns.
df.drop(columns='activityName', inplace=True)
df.head()

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,EventTime,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,2019-04-27,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1654777000.0,1,0,0,0
1,2019-05-09,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,1654777000.0,0,1,0,0
2,2020-01-22,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,1654777000.0,0,0,1,0
3,2019-04-29,1741.0,135.0,148.0,327.922062,2912.31,1822.028,1.598,4.26001,1654777000.0,1,0,0,0
4,2019-05-19,1744.0,139.0,160.0,379.950454,2797.68,1764.769,1.585,5.109863,1654777000.0,1,0,0,0


In [46]:
# Save the preprocessed dataset.
# The DataFrame index is written as well (index=False is not passed); when the file
# is read back with read_csv it shows up as the "Unnamed: 0" column.
df.to_csv('../data.csv')

In [3]:
# # df = pd.read_csv("data.csv", index=False)
# df = pd.read_csv("../data.csv")
# del df['Unnamed: 0']

# display(df.head(3))
# display(df.dtypes)

In [44]:
# del df['startTimeLocal']

Debido a que load_feature_definitions no reconoce la columna 'startTimeLocal', hay que hacer algunas transformaciones. Obtuvimos un buen resultado transformándola a INT.

In [130]:
# Encode the activity date as an integer of the form YYYYMMDD (2019-04-27 -> 20190427).
# Casting to str before parsing keeps the cell re-runnable: on a second run the column
# already holds integers, which pd.to_datetime would otherwise read as nanoseconds
# since the epoch. The vectorised .dt.strftime replaces the row-by-row apply.
df['startTimeLocal'] = (
    pd.to_datetime(df['startTimeLocal'].astype(str))
    .dt.strftime('%Y%m%d')
    .astype(int)
)

display(df['startTimeLocal'])
display(df.head(3))

0      20190427
1      20190509
2      20200122
3      20190429
4      20190519
         ...   
130    20190523
131    20200120
132    20200116
133    20200129
134    20190418
Name: startTimeLocal, Length: 135, dtype: int64

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,EventTime,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_Entrenamiento de fuerza
0,20190427,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1654777000.0,1,0,0,0
1,20190509,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,1654777000.0,0,1,0,0
2,20200122,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,1654777000.0,0,0,1,0


In [139]:
# Rename 'activityName_Entrenamiento de fuerza' because it caused errors in
# garmin_feature_group.create.
# NOTE(review): presumably the spaces in the name are the problem; confirm.
df = df.rename(columns={'activityName_Entrenamiento de fuerza': 'activityName_fuerza'})
df.head(3)

Unnamed: 0,startTimeLocal,movingDuration,averageHR,maxHR,calories,distance,duration,averageSpeed,maxSpeed,EventTime,activityName_Caminar,activityName_Cardio,activityName_Carrera,activityName_fuerza
0,20190427,2088.0,110.0,142.0,306.919776,3579.31,2308.742,1.55,6.800049,1654777000.0,1,0,0,0
1,20190509,39.0,144.0,167.0,,112.18,1245.555,0.090064,1.7,1654777000.0,0,1,0,0
2,20200122,62.0,157.0,169.0,14.0,157.83,66.29,2.381,3.17,1654777000.0,0,0,1,0


Usamos la funcion load_feature_definitions para usar mas adelante en el FG create.


In [140]:
# Derive one feature definition per DataFrame column and attach them to the group.
# In the recorded output the int64 columns come out as Integral and the float64
# columns as Fractional.
garmin_feature_group.load_feature_definitions(data_frame=df)

[FeatureDefinition(feature_name='startTimeLocal', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>),
 FeatureDefinition(feature_name='movingDuration', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='averageHR', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='maxHR', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='calories', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='distance', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='duration', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='averageSpeed', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='maxSpeed', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>),
 FeatureDefinition(feature_name='EventTime', feature_type=<FeatureTypeEn

Creamos el FG


In [141]:
# Register the feature group in SageMaker. Records are keyed by the record identifier
# column and timestamped by EventTime; the online store is enabled as well.
garmin_feature_group.create(
    role_arn=role,
    s3_uri=f"s3://{s3_bucket_name}/{prefix}",
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name="EventTime",
    enable_online_store=True,
)

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:007007875348:feature-group/garmin-feature-group-11-14-12-17',
 'ResponseMetadata': {'RequestId': '1528c94b-5151-44a1-9d2e-b65fa4510d08',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1528c94b-5151-44a1-9d2e-b65fa4510d08',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '109',
   'date': 'Sat, 11 Jun 2022 16:01:05 GMT'},
  'RetryAttempts': 0}}

In [142]:
record_identifier_feature_name

'startTimeLocal'

Para verificar el FG create usamos las APIs FeatureGroup.Describe y FeatureGroups.List 


In [144]:
garmin_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:007007875348:feature-group/garmin-feature-group-11-14-12-17',
 'FeatureGroupName': 'garmin-feature-group-11-14-12-17',
 'RecordIdentifierFeatureName': 'startTimeLocal',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'startTimeLocal',
   'FeatureType': 'Integral'},
  {'FeatureName': 'movingDuration', 'FeatureType': 'Fractional'},
  {'FeatureName': 'averageHR', 'FeatureType': 'Fractional'},
  {'FeatureName': 'maxHR', 'FeatureType': 'Fractional'},
  {'FeatureName': 'calories', 'FeatureType': 'Fractional'},
  {'FeatureName': 'distance', 'FeatureType': 'Fractional'},
  {'FeatureName': 'duration', 'FeatureType': 'Fractional'},
  {'FeatureName': 'averageSpeed', 'FeatureType': 'Fractional'},
  {'FeatureName': 'maxSpeed', 'FeatureType': 'Fractional'},
  {'FeatureName': 'EventTime', 'FeatureType': 'Fractional'},
  {'FeatureName': 'activityName_Caminar', 'FeatureType': 'Integral'},
  {'FeatureName': 'activityName_Card

In [145]:
# List the feature groups through a boto3 'sagemaker' client built from the session.
(
    sagemaker_session.boto_session
    .client('sagemaker', region_name=region)
    .list_feature_groups()
)

{'FeatureGroupSummaries': [{'FeatureGroupName': 'garmin-feature-group-11-14-12-17',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:007007875348:feature-group/garmin-feature-group-11-14-12-17',
   'CreationTime': datetime.datetime(2022, 6, 11, 16, 1, 6, 6000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'}],
 'ResponseMetadata': {'RequestId': '58bcc680-6d67-4c2f-862e-115f9497da86',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '58bcc680-6d67-4c2f-862e-115f9497da86',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '254',
   'date': 'Sat, 11 Jun 2022 16:04:57 GMT'},
  'RetryAttempts': 0}}

# Step 4: Usamos ingest para cargar datos al FG



In [147]:
def check_feature_group_status(feature_group):
    """Block until ``feature_group`` has left the "Creating" state.

    Polls ``feature_group.describe()`` every 5 seconds while the reported
    ``FeatureGroupStatus`` is "Creating".

    Args:
        feature_group: Object exposing ``describe()`` (returning a dict that
            contains "FeatureGroupStatus") and a ``name`` attribute.

    Raises:
        RuntimeError: If the final status is anything other than "Created".
            Previously any final status, including a failed creation, was
            reported as a success.
    """
    description = feature_group.describe()
    while description.get("FeatureGroupStatus") == "Creating":
        print("Waiting for Feature Group to be Created")
        time.sleep(5)
        description = feature_group.describe()

    status = description.get("FeatureGroupStatus")
    if status != "Created":
        # Surface the service-provided reason when the response carries one.
        reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"FeatureGroup {feature_group.name} was not created: status={status!r} ({reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")

check_feature_group_status(garmin_feature_group)

FeatureGroup garmin-feature-group-11-14-12-17 successfully created.


In [148]:
# Load every row of df into the feature group using 3 workers, and wait for the
# ingestion to finish before the cell returns.
garmin_feature_group.ingest(data_frame=df, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='garmin-feature-group-11-14-12-17', sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f495cae97d0>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7f495c189350>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

Llamamos al FG para leer un registro de la tabla cargada usando boto3.


In [11]:
# Read a single record back through the boto3 Feature Store runtime client.
# The feature group name is hard-coded here rather than taken from the earlier cell.
feature_group_name = 'garmin-feature-group-11-14-12-17'
starttimelocal = 20190427  # record identifier: activity date encoded as YYYYMMDD

sagemaker_client_leo = sagemaker_session.boto_session.client(
    'sagemaker-featurestore-runtime',
    region_name=region,
)

# The identifier is passed as a string, hence str().
sample_record = sagemaker_client_leo.get_record(
    FeatureGroupName=feature_group_name,
    RecordIdentifierValueAsString=str(starttimelocal),
)

sample_record

{'ResponseMetadata': {'RequestId': 'd4896894-22ce-4a0f-a425-a815f4ed89de',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd4896894-22ce-4a0f-a425-a815f4ed89de',
   'content-type': 'application/json',
   'content-length': '822',
   'date': 'Sun, 12 Jun 2022 15:14:53 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'startTimeLocal', 'ValueAsString': '20190427'},
  {'FeatureName': 'movingDuration', 'ValueAsString': '2088.0'},
  {'FeatureName': 'averageHR', 'ValueAsString': '110.0'},
  {'FeatureName': 'maxHR', 'ValueAsString': '142.0'},
  {'FeatureName': 'calories', 'ValueAsString': '306.91977603925517'},
  {'FeatureName': 'distance', 'ValueAsString': '3579.31'},
  {'FeatureName': 'duration', 'ValueAsString': '2308.742'},
  {'FeatureName': 'averageSpeed', 'ValueAsString': '1.5499999523162844'},
  {'FeatureName': 'maxSpeed', 'ValueAsString': '6.800048828125'},
  {'FeatureName': 'EventTime', 'ValueAsString': '1654776648.0'},
  {'FeatureName': 'activityName_Caminar',

Ahora leemos un batch de registros con batch_get_record


In [13]:
# Fetch several records in one request. Identifiers are the YYYYMMDD activity dates,
# passed as strings.
all_records = (
    sagemaker_session.boto_session
    .client("sagemaker-featurestore-runtime", region_name=region)
    .batch_get_record(
        Identifiers=[
            {
                "FeatureGroupName": feature_group_name,
                "RecordIdentifiersValueAsString": ["20190427", "20200121", "20200205", "20200210"],
            },
        ]
    )
)

In [14]:
all_records

{'ResponseMetadata': {'RequestId': 'a7100157-b546-46c5-bcf6-a81eda5fc484',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a7100157-b546-46c5-bcf6-a81eda5fc484',
   'content-type': 'application/json',
   'content-length': '3699',
   'date': 'Sun, 12 Jun 2022 15:19:22 GMT'},
  'RetryAttempts': 0},
 'Records': [{'FeatureGroupName': 'garmin-feature-group-11-14-12-17',
   'RecordIdentifierValueAsString': '20200210',
   'Record': [{'FeatureName': 'startTimeLocal', 'ValueAsString': '20200210'},
    {'FeatureName': 'movingDuration', 'ValueAsString': '584.0'},
    {'FeatureName': 'averageHR', 'ValueAsString': '133.0'},
    {'FeatureName': 'maxHR', 'ValueAsString': '176.0'},
    {'FeatureName': 'calories', 'ValueAsString': '119.0'},
    {'FeatureName': 'distance', 'ValueAsString': '1030.22'},
    {'FeatureName': 'duration', 'ValueAsString': '780.243'},
    {'FeatureName': 'averageSpeed', 'ValueAsString': '1.3200000524520874'},
    {'FeatureName': 'maxSpeed', 'ValueAsString': '3.

# Step 4B: Ingestar nuevas actividades deportivas
