In [1]:
import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
import os
import s3fs
from datasets import load_dataset, load_dataset_builder
import pandas as pd
import re

In [3]:
# Look up the AWS account ID of the active credentials through STS and print it.
# NOTE(review): the printed ID is stored in the saved notebook output; clear it
# before sharing the notebook.
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)

597161074694


## Instantiating the SageMaker Feature Group / Database Objects

In [4]:
from sagemaker.session import Session

# Resolve the default region once and pin the session and both clients to it.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# SageMaker API client and Feature Store runtime client, both created from the
# same boto session.
sagemaker_client = boto_session.client("sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    "sagemaker-featurestore-runtime", region_name=region
)

In [5]:
# Bundle the boto session and both clients into a single SageMaker Session.
# This session is handed to the FeatureGroup objects and supplies the default bucket.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [6]:
# Bucket returned by the session's default_bucket() and the key prefix under it.
# Together they form the feature groups' s3_uri and the Athena query output location.
default_s3_bucket_name = feature_store_session.default_bucket()
prefix = "tinystories"

In [7]:
# Feature group names for the training and validation data.
train_fg = 'tinystories-train'
val_fg = 'tinystories-val'

In [8]:
# Handles for the two feature groups, both bound to the shared Feature Store session.
train_feature_group = FeatureGroup(name=train_fg, sagemaker_session=feature_store_session)
val_feature_group = FeatureGroup(name=val_fg, sagemaker_session=feature_store_session)

## Preparing the Data from HuggingFace

In [9]:
# Hugging Face dataset repository that both the training and validation data are loaded from.
hf_repo = 'Alexis-Az/TinyStories'

In [10]:
# Load the training data from the repo's 'refs/convert/parquet' revision (data_dir 'default/train').
# NOTE: despite its name, train_df is not a DataFrame yet; a later cell rebinds it to one.
train_df = load_dataset(hf_repo, revision='refs/convert/parquet', data_dir='default/train')

In [11]:
# Load the validation data from the same revision (data_dir 'default/validation').
# NOTE: the loaded object exposes these rows under the 'train' key, not 'validation'
# (see the DatasetDict displayed in the next cell).
val_df = load_dataset(hf_repo, revision='refs/convert/parquet', data_dir='default/validation')

In [12]:
# Display the loaded object to check its split key, column names and row count.
val_df

DatasetDict({
    train: Dataset({
        features: ['prompt', 'source', 'story', 'summary', 'unique_id', 'timestamp', 'string_words', 'string_features'],
        num_rows: 549020
    })
})

In [13]:
# Convert each loaded split into a pandas DataFrame with Dataset.to_pandas()
# instead of passing the Dataset object through the generic pd.DataFrame(...)
# constructor.
# The isinstance guards make this cell safe to re-run: the names are rebound from
# DatasetDict to DataFrame here, so a second run would otherwise fail on ['train'].
if not isinstance(train_df, pd.DataFrame):
    train_df = train_df['train'].to_pandas()
if not isinstance(val_df, pd.DataFrame):
    val_df = val_df['train'].to_pandas()

### Adding the data type identifiers for the pandas columns

In [14]:
def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas ``string`` dtype.

    The frame is modified in place and nothing is returned. Columns whose dtype
    is not ``object`` are left untouched.

    Args:
        data_frame: pandas DataFrame whose object columns should be re-typed.
    """
    object_labels = [
        label for label in data_frame.columns if data_frame.dtypes[label] == "object"
    ]
    for label in object_labels:
        # Go through "str" first so every value is a Python string, then switch
        # the column to the dedicated pandas string dtype.
        as_text = data_frame[label].astype("str")
        data_frame[label] = as_text.astype("string")

In [15]:
# Re-type the object columns of both frames in place (the helper returns nothing).
cast_object_to_string(train_df)
cast_object_to_string(val_df)

In [16]:
import time
# Current Unix time rounded to whole seconds; round() with no digits already
# returns an int. Used as the shared timestamp value for every record.
current_time_sec = round(time.time())

In [17]:
# Stamp every record with the same event time. Assigning the scalar lets pandas
# broadcast it over each frame's own index, so the column is completely filled
# whatever the index looks like (a separately built Series is aligned by label
# and would leave NaN on a non-default index), and no n-element Python list has
# to be built first.
# float() keeps the column float64, which is what yields the 'Fractional'
# feature type for 'timestamp' in the feature definitions.
train_df["timestamp"] = float(current_time_sec)
val_df["timestamp"] = float(current_time_sec)

In [18]:
# Derive each group's feature definitions from the columns of its DataFrame.
# Only the second call's return value is shown as the cell output.
train_feature_group.load_feature_definitions(train_df)
val_feature_group.load_feature_definitions(val_df)

[FeatureDefinition(feature_name='prompt', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='source', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='story', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='summary', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='unique_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='timestamp', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='string_words', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None),
 FeatureDefinition(feature_name='string_features', feature_type=<FeatureTypeEnum.STRING: 'String'>, collection_type=None)]

In [19]:
def clean_strings(df):
    """Replace every newline with a space in the ``string``-dtype columns of ``df``.

    The frame is modified in place and nothing is returned. Each value is passed
    through ``str()`` before the substitution. Columns of any other dtype are
    left untouched.

    Args:
        df: pandas DataFrame whose string columns should be flattened to one line.
    """
    def _flatten(value):
        # One space per newline character, so consecutive newlines stay visible
        # as consecutive spaces.
        return re.sub('\n', ' ', str(value))

    string_labels = [label for label in df.columns if df.dtypes[label] == "string"]
    for label in string_labels:
        df[label] = df[label].apply(_flatten)

In [20]:
# Strip newlines from the string columns of both frames in place.
clean_strings(train_df)
clean_strings(val_df)

## Linking the Data from HF to the Database

In [21]:
from sagemaker import get_execution_role

# IAM role that is passed as role_arn when the feature groups are created.
# This uses the notebook's execution role; substitute a different role here if needed.
role = get_execution_role()

In [22]:
# Name of the column that identifies a record and of the column that holds its
# event time; both are passed to FeatureGroup.create().
record_identifier_feature_name = "unique_id"
event_time_feature_name = "timestamp"

In [28]:
def wait_for_feature_group_creation_complete(
    feature_group, poll_interval_sec=5, timeout_sec=None
):
    """Block until ``feature_group`` leaves the "Creating" state.

    The group's status is read with ``feature_group.describe()`` and polled
    until it is no longer "Creating".

    Args:
        feature_group: Object exposing ``describe()`` (returning a mapping with
            a "FeatureGroupStatus" entry) and a ``name`` attribute.
        poll_interval_sec: Seconds to sleep between two status checks.
        timeout_sec: Maximum number of seconds to keep polling, or ``None``
            (the default) to poll for as long as the status is "Creating".

    Raises:
        TimeoutError: ``timeout_sec`` elapsed while the status was still "Creating".
        RuntimeError: The final status is anything other than "Created".
    """
    deadline = None if timeout_sec is None else time.monotonic() + timeout_sec

    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Timed out after {timeout_sec}s waiting for feature group "
                f"{feature_group.name} to be created"
            )
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_sec)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")

    if status != "Created":
        # Surface the final status and, when the service reports one, the
        # failure reason, so the error is actionable.
        reason = description.get("FailureReason")
        detail = f"status: {status}" + (f", reason: {reason}" if reason else "")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} ({detail})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")


# Create both feature groups with identical settings, then block until each has
# finished creating. Both creations are requested before waiting on either one.
store_uri = f"s3://{default_s3_bucket_name}/{prefix}"
groups_to_create = (train_feature_group, val_feature_group)

for group in groups_to_create:
    group.create(
        s3_uri=store_uri,
        record_identifier_name=record_identifier_feature_name,
        event_time_feature_name=event_time_feature_name,
        role_arn=role,
        enable_online_store=True,
    )

for group in groups_to_create:
    wait_for_feature_group_creation_complete(feature_group=group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup tinystories-train successfully created.
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup tinystories-val successfully created.


In [24]:
# S3 location built from the default bucket and prefix (the same value that was
# passed as s3_uri when the feature groups were created); displayed for reference.
output_dir = f"s3://{default_s3_bucket_name}/{prefix}"
output_dir

's3://sagemaker-us-east-1-597161074694/tinystories'

### Ingesting the Data into SageMaker Feature Store (queryable through the AWS Glue Data Catalog)

In [31]:
# Ingest the training DataFrame (loaded from the Hugging Face repo) into its
# feature group with max_workers=100; wait=True makes the call wait for ingestion.
# The returned ingestion manager is shown as the cell output.
train_feature_group.ingest(data_frame=train_df, max_workers=100, wait=True)

IngestionManagerPandas(feature_group_name='tinystories-train', feature_definitions={'prompt': {'FeatureName': 'prompt', 'FeatureType': 'String'}, 'source': {'FeatureName': 'source', 'FeatureType': 'String'}, 'story': {'FeatureName': 'story', 'FeatureType': 'String'}, 'summary': {'FeatureName': 'summary', 'FeatureType': 'String'}, 'unique_id': {'FeatureName': 'unique_id', 'FeatureType': 'Integral'}, 'timestamp': {'FeatureName': 'timestamp', 'FeatureType': 'Fractional'}, 'string_features': {'FeatureName': 'string_features', 'FeatureType': 'String'}, 'string_words': {'FeatureName': 'string_words', 'FeatureType': 'String'}}, sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f4b498dccd0>, sagemaker_session=<sagemaker.session.Session object at 0x7f4b498df1d0>, max_workers=100, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7f4a25bea9d0>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [32]:
# Ingest the validation DataFrame (loaded from the Hugging Face repo) into its
# feature group with max_workers=100; wait=True makes the call wait for ingestion.
# The returned ingestion manager is shown as the cell output.
val_feature_group.ingest(data_frame=val_df, max_workers=100, wait=True)

IngestionManagerPandas(feature_group_name='tinystories-val', feature_definitions={'prompt': {'FeatureName': 'prompt', 'FeatureType': 'String'}, 'source': {'FeatureName': 'source', 'FeatureType': 'String'}, 'story': {'FeatureName': 'story', 'FeatureType': 'String'}, 'summary': {'FeatureName': 'summary', 'FeatureType': 'String'}, 'unique_id': {'FeatureName': 'unique_id', 'FeatureType': 'Integral'}, 'timestamp': {'FeatureName': 'timestamp', 'FeatureType': 'Fractional'}, 'string_words': {'FeatureName': 'string_words', 'FeatureType': 'String'}, 'string_features': {'FeatureName': 'string_features', 'FeatureType': 'String'}}, sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f4b498dccd0>, sagemaker_session=<sagemaker.session.Session object at 0x7f4b498df1d0>, max_workers=100, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7f4ab8f39150>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

## Sample Query of the Data

In [33]:
# Athena query helper for the training feature group and the name of the table it targets.
train_query = train_feature_group.athena_query()
train_table = train_query.table_name

In [34]:
# Athena query helper for the validation feature group and the name of the table it targets.
val_query = val_feature_group.athena_query()
val_table = val_query.table_name

In [35]:
# Preview query: the first 100 rows of the training table. The table name is
# double-quoted as an identifier, and a space separates the closing quote from
# LIMIT so the two tokens are not glued together.
query_string = f'SELECT * FROM "{train_table}" LIMIT 100;'
print("Running " + query_string)

Running SELECT * FROM "tinystories_train_1739832791"LIMIT 100;


In [36]:
# Run the preview query, writing the result files under <prefix>/query_results/
# in the default bucket, wait for it to finish, then load the result as a DataFrame.
query_results_uri = f"s3://{default_s3_bucket_name}/{prefix}/query_results/"
train_query.run(query_string=query_string, output_location=query_results_uri)
train_query.wait()
dataset = train_query.as_dataframe()

In [38]:
# Show 5 random rows of the query result. No random_state is set, so the rows
# shown differ from run to run.
dataset.sample(5)

Unnamed: 0,prompt,source,story,summary,unique_id,timestamp,string_features,string_words,write_time,api_invocation_time,is_deleted
42,Write a short story (3-5 paragraphs) which onl...,GPT-4,"Once upon a time, there was a man and a woman....",A married couple has fights but decides to sav...,8590208090,1739831000.0,narrative features: Dialogue,"vocabulary features: save, marriage, tidy",2025-02-17 23:56:07.941,2025-02-17 23:51:09.000,False
49,Write a short story (3-5 paragraphs) which onl...,GPT-4,"Once upon a time, there was a little brown dog...","Buddy, a little brown dog, helps his family by...",25769817194,1739831000.0,"narrative features: Dialogue, Conflict, MoralV...","vocabulary features: fix, emergency, brown",2025-02-17 23:56:07.941,2025-02-17 23:51:10.000,False
44,Write a short story (3-5 paragraphs) which onl...,GPT-4,Lila and Ben liked to play in the garden. They...,"Lila and Ben raised a dragon named Flap, but h...",8589977991,1739831000.0,narrative features:,"vocabulary features: raise, dragon, hungry",2025-02-17 23:56:07.941,2025-02-17 23:51:10.000,False
29,Write a short story (3-5 paragraphs) which onl...,GPT-4,Sara loved to play with her dolls. She had man...,Sara steals a magic wand from a store and acci...,17179967172,1739831000.0,"narrative features: MoralValue, Foreshadowing","vocabulary features: shrink, spirit, ashamed",2025-02-17 23:56:07.941,2025-02-17 23:51:09.000,False
22,Write a short story (3-5 paragraphs) which onl...,GPT-4,Lily and Ben are twins. They like to play with...,Lily and Ben try cauliflower soup for the firs...,8590068237,1739831000.0,narrative features: Dialogue,"vocabulary features: do, cauliflower, comfortable",2025-02-17 23:56:07.941,2025-02-17 23:51:08.000,False


In [39]:
# Names of the tables (in the Glue database) that back the training and
# validation feature groups; these are the names to use in Athena queries.
print(train_table)
print(val_table)

tinystories_train_1739832791
tinystories_val_1739832793
