### Assignment 3: Feature Stores

Credits: I used the Amazon Documentation Developer Guide to help with the feature store. Link provided [HERE](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-fraud-detection-notebook.html)

### Creating Feature Store Session

In [1]:
# Import the necessary libraries
# Libraries for creating feature store session
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role

# Libraries for interacting with the dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

# Feature Group
import time
from time import gmtime, strftime, sleep
from sagemaker.feature_store.feature_group import FeatureGroup

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [2]:
# Helpful Functions + Variables stored here
def encode_col(df, col):
    names = df[col].unique()
    values = len(names)
    dict_pairs = dict([(key,value) for _, (key,value) in enumerate(zip(names,np.arange(values)))])
    df[col] = df[col].map(dict_pairs)
    df[col] = df[col].astype('float64')
    return df, dict_pairs

# def encode_col(df, col):
#     df[col], _ = df[col].factorize()
#     df[col] = df[col].astype('float64')
#     return df

def convert_to_strings(df):
    for col in df.columns:
        if df.dtypes[col] == 'object':
            df[col] = df[col].astype(str)
            
def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")

            
time_now = int(round(time.time()))

In [3]:
# Create the session by identifying the variables
region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [4]:
# Creating default bucket
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "sagemaker-featurestore-demo"

print(default_s3_bucket_name)

sagemaker-us-east-1-004608622582


In [5]:
# Grab Role
role = get_execution_role()
print(role)

arn:aws:iam::004608622582:role/LabRole


In [6]:
# Start the client + feature store runtime
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

In [7]:
# Create feature store session
feature_store_session = Session(boto_session=boto_session, sagemaker_client=sagemaker_client, sagemaker_featurestore_runtime_client=featurestore_runtime)

### Loading Data and Partitioning it into DataGroups

In [8]:
# Reading in the data
housing_df = pd.read_csv('housing.csv')
housing_gmaps_df =  pd.read_csv('housing_gmaps_data_raw.csv')

In [9]:
housing_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [10]:
housing_gmaps_df.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,establishment-natural_feature,airport-establishment-point_of_interest,political-sublocality-sublocality_level_1,administrative_area_level_3-political,post_box,establishment-light_rail_station-point_of_interest-transit_station,establishment-point_of_interest,aquarium-establishment-park-point_of_interest-tourist_attraction-zoo,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,,,,,,,,
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,,,,,,,,
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,,,,,,,,
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,,,,,,,,
4,5407,Bryant Avenue,Oakland,Alameda County,California,United States,94618.0,"5407 Bryant Ave, Oakland, CA 94618, USA",-122.25,37.84,...,,,,,,,,,,


In [11]:
housing_df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [12]:
housing_gmaps_df.describe()

Unnamed: 0,postal_code,longitude,latitude,postal_code_suffix
count,12410.0,12590.0,12590.0,7999.0
mean,93348.943836,-119.676724,35.895577,4177.914614
std,1765.572652,2.042677,2.219248,2474.063791
min,85344.0,-124.35,32.54,110.0
25%,92054.0,-121.76,33.97,2230.5
50%,93301.0,-119.27,35.34,3556.0
75%,95050.0,-117.95,37.81,5529.0
max,96161.0,-114.31,41.95,9859.0


In [13]:
df = pd.merge(housing_gmaps_df, housing_df, left_on=['longitude', 'latitude'], right_on=['longitude', 'latitude'], how='right')

In [14]:
df.head()

Unnamed: 0,street_number,route,locality-political,administrative_area_level_2-political,administrative_area_level_1-political,country-political,postal_code,address,longitude,latitude,...,campground-establishment-lodging-park-point_of_interest-rv_park-tourist_attraction,cemetery-establishment-park-point_of_interest,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,3130,Grizzly Peak Boulevard,Berkeley,Alameda County,California,United States,94705.0,"3130 Grizzly Peak Blvd, Berkeley, CA 94705, USA",-122.23,37.88,...,,,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,2005,Tunnel Road,Oakland,Alameda County,California,United States,94611.0,"2005 Tunnel Rd, Oakland, CA 94611, USA",-122.22,37.86,...,,,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,6886,Chabot Road,Oakland,Alameda County,California,United States,94618.0,"6886 Chabot Rd, Oakland, CA 94618, USA",-122.24,37.85,...,,,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,6365,Florio Street,Oakland,Alameda County,California,United States,94618.0,"6365 Florio St, Oakland, CA 94618, USA",-122.25,37.85,...,,,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [15]:
# Priority Key
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 38 columns):
 #   Column                                                                              Non-Null Count  Dtype  
---  ------                                                                              --------------  -----  
 0   street_number                                                                       19008 non-null  object 
 1   route                                                                               20091 non-null  object 
 2   locality-political                                                                  20452 non-null  object 
 3   administrative_area_level_2-political                                               20589 non-null  object 
 4   administrative_area_level_1-political                                               20637 non-null  object 
 5   country-political                                                                   20640 non-n

In [16]:
# Grabbing the features for our group
feature_cols = ['neighborhood-political',
                'ocean_proximity',
                'median_house_value',
                'housing_median_age',
                'households',
                'total_bedrooms',
                'locality-political']

# Creating new df based on the destired features
feature_df = df[feature_cols]

# Dropping an null values based on the primary key
feature_df = feature_df.dropna(subset='neighborhood-political')

# Renaming some of the columss for simplicity sake
feature_df = feature_df.rename(columns={'neighborhood-political':'nbh_pol',
                                        'locality-political': 'loc_pol',
                                        'ocean_proximity':'ocn_prox',
                                        'median_house_value': 'med_hse_val',
                                        'housing_median_age': 'hse_med_age',
                                        'households': 'tot_house',
                                        'total_bedrooms': 'tot_bed'}
                              )
                

In [17]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9000 entries, 1 to 20636
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   nbh_pol      9000 non-null   object 
 1   ocn_prox     9000 non-null   object 
 2   med_hse_val  9000 non-null   float64
 3   hse_med_age  9000 non-null   float64
 4   tot_house    9000 non-null   float64
 5   tot_bed      8911 non-null   float64
 6   loc_pol      8955 non-null   object 
dtypes: float64(4), object(3)
memory usage: 562.5+ KB


In [18]:
# Households becase on locality
house_df = feature_df[['loc_pol', 
                      'tot_house', 
                      'tot_bed']]

# Finding average for locality Code
house_df = house_df.groupby('loc_pol').mean()

# Renaming total to averages for average colculation
house_df = house_df.rename(columns={'tot_bed': 'avg_bed',
                                    'tot_house': 'avg_house'}
                          )

# Finding average bedrooms per household
house_df['avg_bed_per_house'] = house_df['avg_bed'].div(house_df['avg_house'], axis=0).round()

# Merge the new df back into the feature_df
feature_df = pd.merge(feature_df, house_df, left_on=['loc_pol'], right_on=['loc_pol'], how='left')

In [19]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   nbh_pol            9000 non-null   object 
 1   ocn_prox           9000 non-null   object 
 2   med_hse_val        9000 non-null   float64
 3   hse_med_age        9000 non-null   float64
 4   tot_house          9000 non-null   float64
 5   tot_bed            8911 non-null   float64
 6   loc_pol            8955 non-null   object 
 7   avg_house          8955 non-null   float64
 8   avg_bed            8954 non-null   float64
 9   avg_bed_per_house  8954 non-null   float64
dtypes: float64(7), object(3)
memory usage: 703.2+ KB


In [20]:
# Encoding datatypes from objects to floats

# One hot encode ocn_prox
encode_ocn_prox = pd.get_dummies(feature_df['ocn_prox'], dtype='float64')

# Encode Locality
encode_loc_pol, dict_pairs = encode_col(feature_df,'loc_pol')

# Combine the two encoded columns together
combined_cols = pd.concat([encode_loc_pol,encode_ocn_prox, ], axis=1)

In [21]:
combined_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   nbh_pol            9000 non-null   object 
 1   ocn_prox           9000 non-null   object 
 2   med_hse_val        9000 non-null   float64
 3   hse_med_age        9000 non-null   float64
 4   tot_house          9000 non-null   float64
 5   tot_bed            8911 non-null   float64
 6   loc_pol            9000 non-null   float64
 7   avg_house          8955 non-null   float64
 8   avg_bed            8954 non-null   float64
 9   avg_bed_per_house  8954 non-null   float64
 10  <1H OCEAN          9000 non-null   float64
 11  INLAND             9000 non-null   float64
 12  NEAR BAY           9000 non-null   float64
 13  NEAR OCEAN         9000 non-null   float64
dtypes: float64(12), object(2)
memory usage: 984.5+ KB


In [22]:
# Now that we got what we need form ocn_prox, we can drop that column
combined_cols = combined_cols.drop(columns=['ocn_prox'])

In [23]:
combined_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   nbh_pol            9000 non-null   object 
 1   med_hse_val        9000 non-null   float64
 2   hse_med_age        9000 non-null   float64
 3   tot_house          9000 non-null   float64
 4   tot_bed            8911 non-null   float64
 5   loc_pol            9000 non-null   float64
 6   avg_house          8955 non-null   float64
 7   avg_bed            8954 non-null   float64
 8   avg_bed_per_house  8954 non-null   float64
 9   <1H OCEAN          9000 non-null   float64
 10  INLAND             9000 non-null   float64
 11  NEAR BAY           9000 non-null   float64
 12  NEAR OCEAN         9000 non-null   float64
dtypes: float64(12), object(1)
memory usage: 914.2+ KB


In [24]:
# Rename and Group the neighboorhoods and create an index out of them
new_df = combined_cols.groupby('nbh_pol').mean()

In [25]:
new_df.head()

Unnamed: 0_level_0,med_hse_val,hse_med_age,tot_house,tot_bed,loc_pol,avg_house,avg_bed,avg_bed_per_house,<1H OCEAN,INLAND,NEAR BAY,NEAR OCEAN
nbh_pol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
28 Palms,222200.0,25.0,923.0,939.0,5.0,863.238806,894.686567,1.0,1.0,0.0,0.0,0.0
Acorn Industrial,81300.0,52.0,147.0,244.0,0.0,370.966197,397.541076,1.0,0.0,0.0,1.0,0.0
Adams Hill,250733.333333,39.5,493.666667,520.166667,36.0,579.542056,614.6,1.0,1.0,0.0,0.0,0.0
Agua Mansa Industrial Corridor,112300.0,17.0,516.0,569.0,138.0,516.0,569.0,1.0,0.0,1.0,0.0,0.0
Al Tahoe,109180.0,23.8,248.8,399.8,20.0,248.8,399.8,2.0,0.0,1.0,0.0,0.0


In [26]:
# Rename
new_df = new_df.reset_index().rename(columns={'index': 'nbh_pol',
                                              '<1H OCEAN': 'ls_1_ocn',
                                              'INLAND': 'inland',
                                              'NEAR BAY': 'nr_bay',
                                              'NEAR OCEAN': 'nr_ocn'}
                                    )

In [27]:
new_df.head()

Unnamed: 0,nbh_pol,med_hse_val,hse_med_age,tot_house,tot_bed,loc_pol,avg_house,avg_bed,avg_bed_per_house,ls_1_ocn,inland,nr_bay,nr_ocn
0,28 Palms,222200.0,25.0,923.0,939.0,5.0,863.238806,894.686567,1.0,1.0,0.0,0.0,0.0
1,Acorn Industrial,81300.0,52.0,147.0,244.0,0.0,370.966197,397.541076,1.0,0.0,0.0,1.0,0.0
2,Adams Hill,250733.333333,39.5,493.666667,520.166667,36.0,579.542056,614.6,1.0,1.0,0.0,0.0,0.0
3,Agua Mansa Industrial Corridor,112300.0,17.0,516.0,569.0,138.0,516.0,569.0,1.0,0.0,1.0,0.0,0.0
4,Al Tahoe,109180.0,23.8,248.8,399.8,20.0,248.8,399.8,2.0,0.0,1.0,0.0,0.0


In [28]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306 entries, 0 to 1305
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   nbh_pol            1306 non-null   object 
 1   med_hse_val        1306 non-null   float64
 2   hse_med_age        1306 non-null   float64
 3   tot_house          1306 non-null   float64
 4   tot_bed            1300 non-null   float64
 5   loc_pol            1306 non-null   float64
 6   avg_house          1293 non-null   float64
 7   avg_bed            1292 non-null   float64
 8   avg_bed_per_house  1292 non-null   float64
 9   ls_1_ocn           1306 non-null   float64
 10  inland             1306 non-null   float64
 11  nr_bay             1306 non-null   float64
 12  nr_ocn             1306 non-null   float64
dtypes: float64(12), object(1)
memory usage: 132.8+ KB


In [29]:
# Encode the nbh_pol
# Creating a dataframe
encode_df = pd.DataFrame.from_dict(dict_pairs, orient='index')

# Reset the index and name the column
encode_df = encode_df.reset_index().rename(columns={'index': 'nbh_pol_new',
                                                    0: 'nbh_pol_encode'}
                                          )
# encode_df.info()

# Ensure DF for the encoded values are the same                                 
encode_df['nbh_pol_encode'] = encode_df['nbh_pol_encode'].astype('float64')

In [30]:
encode_df

Unnamed: 0,nbh_pol_new,nbh_pol_encode
0,Oakland,0.0
1,Berkeley,1.0
2,San Leandro,2.0
3,Alameda,3.0
4,Hayward,4.0
...,...,...
200,Porterville,200.0
201,Ventura,201.0
202,Oxnard,202.0
203,Thousand Oaks,203.0


In [31]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306 entries, 0 to 1305
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   nbh_pol            1306 non-null   object 
 1   med_hse_val        1306 non-null   float64
 2   hse_med_age        1306 non-null   float64
 3   tot_house          1306 non-null   float64
 4   tot_bed            1300 non-null   float64
 5   loc_pol            1306 non-null   float64
 6   avg_house          1293 non-null   float64
 7   avg_bed            1292 non-null   float64
 8   avg_bed_per_house  1292 non-null   float64
 9   ls_1_ocn           1306 non-null   float64
 10  inland             1306 non-null   float64
 11  nr_bay             1306 non-null   float64
 12  nr_ocn             1306 non-null   float64
dtypes: float64(12), object(1)
memory usage: 132.8+ KB


In [32]:
# Calculating the bedrooms per houseold
new_df['bed_per_hse'] = new_df['tot_bed'].div(new_df['tot_house'], axis=0)

In [33]:
new_df.isna().count()

nbh_pol              1306
med_hse_val          1306
hse_med_age          1306
tot_house            1306
tot_bed              1306
loc_pol              1306
avg_house            1306
avg_bed              1306
avg_bed_per_house    1306
ls_1_ocn             1306
inland               1306
nr_bay               1306
nr_ocn               1306
bed_per_hse          1306
dtype: int64

In [34]:
# Checking out the cities to add
new_df[new_df['nbh_pol']=='Brooktree'], new_df[new_df['nbh_pol']== "Fisherman's Wharf"], new_df[new_df['nbh_pol']=='Los Osos']

(       nbh_pol  med_hse_val  hse_med_age  tot_house  tot_bed  loc_pol  \
 130  Brooktree     257400.0          9.0     1438.0      NaN    182.0   
 
       avg_house     avg_bed  avg_bed_per_house  ls_1_ocn  inland  nr_bay  \
 130  532.506148  548.538144                1.0       1.0     0.0     0.0   
 
      nr_ocn  bed_per_hse  
 130     0.0          NaN  ,
                nbh_pol  med_hse_val  hse_med_age  tot_house  tot_bed  loc_pol  \
 390  Fisherman's Wharf     500001.0         52.0      250.0    317.0    160.0   
 
      avg_house     avg_bed  avg_bed_per_house  ls_1_ocn  inland  nr_bay  \
 390      501.0  535.384899                1.0       0.0     0.0     1.0   
 
      nr_ocn  bed_per_hse  
 390     0.0        1.268  ,
       nbh_pol  med_hse_val  hse_med_age  tot_house  tot_bed  loc_pol  \
 604  Los Osos     221612.5       15.375     611.75    642.5    163.0   
 
      avg_house  avg_bed  avg_bed_per_house  ls_1_ocn  inland  nr_bay  nr_ocn  \
 604     611.75    642.5       

### Ingest Data into Feature Store + Setup Feature Group

In [35]:
# Creating the names and the time-stamp
neighborhood_feature_group_name = "neighborhood-feature-group-" + strftime("%d-%H-%M-%S", gmtime())
encoded_feature_group_name = "encoded-feature-group-" + strftime("%d-%H-%M-%S", gmtime())

In [36]:
# Creating Feature Group
neighborhood_feature_group = FeatureGroup(name=neighborhood_feature_group_name, sagemaker_session=feature_store_session)
encoded_feature_group = FeatureGroup(name=encoded_feature_group_name, sagemaker_session=feature_store_session)

In [37]:
# Make all objects intro string
convert_to_strings(new_df)
convert_to_strings(encode_df)

### Setup Record Identifier and Event Time Features

In [38]:
# Creating record identifier + time features
primary_key_identifier = 'nbh_pol'
secondary_key_identifier = 'nbh_pol_new' # pol_loc
event_time_identifier = 'event_time'

# Tack on the event time to the two df
new_df[event_time_identifier] = pd.Series([time_now]*len(new_df), dtype='float64')
encode_df[event_time_identifier] = pd.Series([time_now]*len(encode_df), dtype='float64')

In [39]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306 entries, 0 to 1305
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   nbh_pol            1306 non-null   object 
 1   med_hse_val        1306 non-null   float64
 2   hse_med_age        1306 non-null   float64
 3   tot_house          1306 non-null   float64
 4   tot_bed            1300 non-null   float64
 5   loc_pol            1306 non-null   float64
 6   avg_house          1293 non-null   float64
 7   avg_bed            1292 non-null   float64
 8   avg_bed_per_house  1292 non-null   float64
 9   ls_1_ocn           1306 non-null   float64
 10  inland             1306 non-null   float64
 11  nr_bay             1306 non-null   float64
 12  nr_ocn             1306 non-null   float64
 13  bed_per_hse        1300 non-null   float64
 14  event_time         1306 non-null   float64
dtypes: float64(14), object(1)
memory usage: 153.2+ KB


In [40]:
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   nbh_pol_new     205 non-null    object 
 1   nbh_pol_encode  205 non-null    float64
 2   event_time      205 non-null    float64
dtypes: float64(2), object(1)
memory usage: 4.9+ KB


### Load Feature Defintions

In [41]:
neighborhood_feature_group.load_feature_definitions(data_frame=new_df)

[FeatureDefinition(feature_name='nbh_pol', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='med_hse_val', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='hse_med_age', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='tot_house', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='tot_bed', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='loc_pol', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='avg_house', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='avg_bed', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefi

In [42]:
encoded_feature_group.load_feature_definitions(data_frame=encode_df)

[FeatureDefinition(feature_name='nbh_pol_new', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='nbh_pol_encode', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='event_time', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None)]

### Create a Feature Group

In [43]:
neighborhood_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=primary_key_identifier,
    event_time_feature_name=event_time_identifier,
    role_arn=role,
    enable_online_store=True,
)

encoded_feature_group.create(
    s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
    record_identifier_name=secondary_key_identifier,
    event_time_feature_name=event_time_identifier,
    role_arn=role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(feature_group=neighborhood_feature_group)
wait_for_feature_group_creation_complete(feature_group=encoded_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup neighborhood-feature-group-24-20-56-29 successfully created.
Waiting for Feature Group Creation
FeatureGroup encoded-feature-group-24-20-56-29 successfully created.


### Checking Results + Putting Records In

In [44]:
neighborhood_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:004608622582:feature-group/neighborhood-feature-group-24-20-56-29',
 'FeatureGroupName': 'neighborhood-feature-group-24-20-56-29',
 'RecordIdentifierFeatureName': 'nbh_pol',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'nbh_pol', 'FeatureType': 'String'},
  {'FeatureName': 'med_hse_val', 'FeatureType': 'Fractional'},
  {'FeatureName': 'hse_med_age', 'FeatureType': 'Fractional'},
  {'FeatureName': 'tot_house', 'FeatureType': 'Fractional'},
  {'FeatureName': 'tot_bed', 'FeatureType': 'Fractional'},
  {'FeatureName': 'loc_pol', 'FeatureType': 'Fractional'},
  {'FeatureName': 'avg_house', 'FeatureType': 'Fractional'},
  {'FeatureName': 'avg_bed', 'FeatureType': 'Fractional'},
  {'FeatureName': 'avg_bed_per_house', 'FeatureType': 'Fractional'},
  {'FeatureName': 'ls_1_ocn', 'FeatureType': 'Fractional'},
  {'FeatureName': 'inland', 'FeatureType': 'Fractional'},
  {'FeatureName': 'nr_bay', 'FeatureType': 'Fract

In [46]:
encoded_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:004608622582:feature-group/encoded-feature-group-24-20-56-29',
 'FeatureGroupName': 'encoded-feature-group-24-20-56-29',
 'RecordIdentifierFeatureName': 'nbh_pol_new',
 'EventTimeFeatureName': 'event_time',
 'FeatureDefinitions': [{'FeatureName': 'nbh_pol_new',
   'FeatureType': 'String'},
  {'FeatureName': 'nbh_pol_encode', 'FeatureType': 'Fractional'},
  {'FeatureName': 'event_time', 'FeatureType': 'Fractional'}],
 'CreationTime': datetime.datetime(2024, 5, 24, 20, 56, 32, 482000, tzinfo=tzlocal()),
 'OnlineStoreConfig': {'EnableOnlineStore': True},
 'OfflineStoreConfig': {'S3StorageConfig': {'S3Uri': 's3://sagemaker-us-east-1-004608622582/sagemaker-featurestore-demo',
   'ResolvedOutputS3Uri': 's3://sagemaker-us-east-1-004608622582/sagemaker-featurestore-demo/004608622582/sagemaker/us-east-1/offline-store/encoded-feature-group-24-20-56-29-1716584192/data'},
  'DisableGlueTableCreation': False,
  'DataCatalogConfig': {'TableName': 'enc

In [48]:
neighborhood_feature_group.ingest(data_frame=new_df, max_workers=3, wait=True)

IngestionManagerPandas(feature_group_name='neighborhood-feature-group-24-20-56-29', feature_definitions={'nbh_pol': {'FeatureName': 'nbh_pol', 'FeatureType': 'String'}, 'med_hse_val': {'FeatureName': 'med_hse_val', 'FeatureType': 'Fractional'}, 'hse_med_age': {'FeatureName': 'hse_med_age', 'FeatureType': 'Fractional'}, 'tot_house': {'FeatureName': 'tot_house', 'FeatureType': 'Fractional'}, 'tot_bed': {'FeatureName': 'tot_bed', 'FeatureType': 'Fractional'}, 'loc_pol': {'FeatureName': 'loc_pol', 'FeatureType': 'Fractional'}, 'avg_house': {'FeatureName': 'avg_house', 'FeatureType': 'Fractional'}, 'avg_bed': {'FeatureName': 'avg_bed', 'FeatureType': 'Fractional'}, 'avg_bed_per_house': {'FeatureName': 'avg_bed_per_house', 'FeatureType': 'Fractional'}, 'ls_1_ocn': {'FeatureName': 'ls_1_ocn', 'FeatureType': 'Fractional'}, 'inland': {'FeatureName': 'inland', 'FeatureType': 'Fractional'}, 'nr_bay': {'FeatureName': 'nr_bay', 'FeatureType': 'Fractional'}, 'nr_ocn': {'FeatureName': 'nr_ocn', 'Feat

In [49]:
encoded_feature_group.ingest(data_frame=encode_df, max_workers=5, wait=True)

IngestionManagerPandas(feature_group_name='encoded-feature-group-24-20-56-29', feature_definitions={'nbh_pol_new': {'FeatureName': 'nbh_pol_new', 'FeatureType': 'String'}, 'nbh_pol_encode': {'FeatureName': 'nbh_pol_encode', 'FeatureType': 'Fractional'}, 'event_time': {'FeatureName': 'event_time', 'FeatureType': 'Fractional'}}, sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f568d93b100>, sagemaker_session=<sagemaker.session.Session object at 0x7f568d9d4730>, max_workers=5, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7f568d9663e0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [52]:
### Grabbing the Record from the online store
record_identifier_value = 'Brooktree'

featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

{'ResponseMetadata': {'RequestId': '524106e0-3df5-48df-bfe4-0b5ca3f12fc3',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '524106e0-3df5-48df-bfe4-0b5ca3f12fc3',
   'content-type': 'application/json',
   'content-length': '1054',
   'date': 'Fri, 24 May 2024 21:01:11 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'nbh_pol', 'ValueAsString': 'Brooktree'},
  {'FeatureName': 'med_hse_val', 'ValueAsString': '257400.0'},
  {'FeatureName': 'hse_med_age', 'ValueAsString': '9.0'},
  {'FeatureName': 'tot_house', 'ValueAsString': '1438.0'},
  {'FeatureName': 'loc_pol', 'ValueAsString': '182.0'},
  {'FeatureName': 'avg_house', 'ValueAsString': '532.5061475409836'},
  {'FeatureName': 'avg_bed', 'ValueAsString': '548.5381443298969'},
  {'FeatureName': 'avg_bed_per_house', 'ValueAsString': '1.0'},
  {'FeatureName': 'ls_1_ocn', 'ValueAsString': '1.0'},
  {'FeatureName': 'inland', 'ValueAsString': '0.0'},
  {'FeatureName': 'nr_bay', 'ValueAsString': '0.0'},
  {'FeatureName':

In [53]:
record_identifier_value = "Fisherman's Wharf"

featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

{'ResponseMetadata': {'RequestId': '00c5eaad-81ae-41c5-a99d-c4084c43bded',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '00c5eaad-81ae-41c5-a99d-c4084c43bded',
   'content-type': 'application/json',
   'content-length': '1204',
   'date': 'Fri, 24 May 2024 21:01:13 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'nbh_pol', 'ValueAsString': "Fisherman's Wharf"},
  {'FeatureName': 'med_hse_val', 'ValueAsString': '500001.0'},
  {'FeatureName': 'hse_med_age', 'ValueAsString': '52.0'},
  {'FeatureName': 'tot_house', 'ValueAsString': '250.0'},
  {'FeatureName': 'tot_bed', 'ValueAsString': '317.0'},
  {'FeatureName': 'loc_pol', 'ValueAsString': '160.0'},
  {'FeatureName': 'avg_house', 'ValueAsString': '501.0'},
  {'FeatureName': 'avg_bed', 'ValueAsString': '535.3848987108655'},
  {'FeatureName': 'avg_bed_per_house', 'ValueAsString': '1.0'},
  {'FeatureName': 'ls_1_ocn', 'ValueAsString': '0.0'},
  {'FeatureName': 'inland', 'ValueAsString': '0.0'},
  {'FeatureName': 

In [54]:
record_identifier_value = 'Los Osos'

featurestore_runtime.get_record(
    FeatureGroupName=neighborhood_feature_group_name,
    RecordIdentifierValueAsString=record_identifier_value,
)

{'ResponseMetadata': {'RequestId': '010e5a93-7e39-4eca-95e4-337f31c914aa',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '010e5a93-7e39-4eca-95e4-337f31c914aa',
   'content-type': 'application/json',
   'content-length': '1200',
   'date': 'Fri, 24 May 2024 21:01:15 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'nbh_pol', 'ValueAsString': 'Los Osos'},
  {'FeatureName': 'med_hse_val', 'ValueAsString': '221612.5'},
  {'FeatureName': 'hse_med_age', 'ValueAsString': '15.375'},
  {'FeatureName': 'tot_house', 'ValueAsString': '611.75'},
  {'FeatureName': 'tot_bed', 'ValueAsString': '642.5'},
  {'FeatureName': 'loc_pol', 'ValueAsString': '163.0'},
  {'FeatureName': 'avg_house', 'ValueAsString': '611.75'},
  {'FeatureName': 'avg_bed', 'ValueAsString': '642.5'},
  {'FeatureName': 'avg_bed_per_house', 'ValueAsString': '1.0'},
  {'FeatureName': 'ls_1_ocn', 'ValueAsString': '0.0'},
  {'FeatureName': 'inland', 'ValueAsString': '0.0'},
  {'FeatureName': 'nr_bay', 'ValueA

In [45]:
a

NameError: name 'a' is not defined

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>