In [10]:
import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.session import Session

In [11]:
# Resolve the AWS account ID of the credentials this kernel is running with.
# NOTE(review): the printed ID is stored in the notebook output; clear the
# output before sharing if the account ID should stay private.
sts_client = boto3.client("sts")
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity["Account"]
print(account_id)

597161074694


## Instantiating the SageMaker Feature Group / Database Object

In [12]:
# Build a single boto3 session and derive both clients from it, so they share
# the same credentials and region. (Previously a throwaway session was created
# just to read the region, which was then passed back in redundantly.)
boto_session = boto3.Session()

# Region resolved by boto3 from this kernel's environment / AWS config.
region = boto_session.region_name

# SageMaker service client; inherits the region from boto_session.
sagemaker_client = boto_session.client(service_name="sagemaker")

# Feature Store runtime client; used further down for batch_get_record.
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime"
)

In [16]:
# Bundle the boto3 session and both clients into one SageMaker Session, which
# the FeatureGroup objects are constructed with.
session_kwargs = {
    "boto_session": boto_session,
    "sagemaker_client": sagemaker_client,
    "sagemaker_featurestore_runtime_client": featurestore_runtime,
}
feature_store_session = Session(**session_kwargs)

In [17]:
# Key prefix under which this notebook writes to S3 (Athena query results).
prefix = "tinystories"

# Bucket name as reported by the SageMaker session's default bucket.
default_s3_bucket_name = feature_store_session.default_bucket()

In [18]:
# Names of the train / validation feature groups used throughout the notebook.
train_fg, val_fg = "tinystories-train", "tinystories-val"

In [19]:
# Instantiate one FeatureGroup object per split, both bound to the same session.
# The unpacking order follows the tuple below: train first, then val.
train_feature_group, val_feature_group = (
    FeatureGroup(name=group_name, sagemaker_session=feature_store_session)
    for group_name in (train_fg, val_fg)
)

## Linking the Data from Hugging Face to the Database

## Sample Query of the Data

In [20]:
# Athena query object for the train feature group.
train_query = train_feature_group.athena_query()
# Table name that the SELECT statement further down is built from.
train_table = train_query.table_name

In [21]:
# Athena query object for the validation feature group.
val_query = val_feature_group.athena_query()
# Table name for the validation split; not used by the cells visible in this section.
val_table = val_query.table_name

In [22]:
# Preview query against the train table.
# The table name is double-quoted as an identifier. A space now separates the
# closing quote from LIMIT; the previous string produced `"<table>"LIMIT 100;`.
query_string = f'SELECT * FROM "{train_table}" LIMIT 100;'
print("Running " + query_string)

Running SELECT * FROM "tinystories_train_1739823439"LIMIT 100;


In [23]:
# S3 location handed to Athena as the output location for this query.
query_results_uri = f"s3://{default_s3_bucket_name}/{prefix}/query_results/"

# Submit the query, wait for it to finish, then fetch the result as a DataFrame.
train_query.run(query_string=query_string, output_location=query_results_uri)
train_query.wait()
dataset = train_query.as_dataframe()

In [24]:
# Show only the first rows instead of dumping the whole result (up to 100 rows).
dataset.head()

Unnamed: 0,prompt,source,story,summary,unique_id,timestamp,string_features,string_words,write_time,api_invocation_time,is_deleted


In [19]:
# Look up record identifier "1" in both feature groups with one batch request.
record_identifiers = [
    {"FeatureGroupName": group_name, "RecordIdentifiersValueAsString": ["1"]}
    for group_name in (train_fg, val_fg)
]
batch_response = featurestore_runtime.batch_get_record(Identifiers=record_identifiers)
batch_response.items()

dict_items([('ResponseMetadata', {'RequestId': 'e36f9924-6701-4b2d-8173-f8d8e64baecb', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e36f9924-6701-4b2d-8173-f8d8e64baecb', 'content-type': 'application/json', 'content-length': '54', 'date': 'Fri, 14 Feb 2025 03:32:12 GMT'}, 'RetryAttempts': 0}), ('Records', []), ('Errors', []), ('UnprocessedIdentifiers', [])])