In [74]:
import sagemaker
from sagemaker import get_execution_role
import boto3
import os
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

# IAM execution role of this notebook environment
role = get_execution_role()

# S3 destination used throughout the notebook: the session's default bucket plus a key prefix
prefix = 'sagemaker/DEMO-xgboost-dm'
bucket = sagemaker.Session().default_bucket()

# Get the data from SageMaker Feature Store

In [75]:
# Resolve the region once, then bind one boto3 session (and every client) to it
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# SageMaker client and Feature Store runtime client, both created from the same session
sagemaker_client = boto_session.client('sagemaker', region_name=region)
featurestore_runtime = boto_session.client('sagemaker-featurestore-runtime', region_name=region)

# SageMaker session built from the clients above; passed to the FeatureGroup handle
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# Handle to the feature group that holds the data
feature_group_name = "FG-dataflow-8defd904"  # replace with your feature group name
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session,
)

In [76]:
# Build the SQL query that selects every column from the feature group's Athena table
fs_query = feature_group.athena_query()
fs_table = fs_query.table_name
query_string = f'SELECT * FROM "{fs_table}"'
print(f'Running {query_string}')

Running SELECT * FROM "fg_dataflow_8defd904_1729124938"


In [77]:
# Run the Athena query, wait for it to finish, then load the result into a pandas DataFrame.
# Query results are written under <prefix>/fs_query_results/ in the notebook's bucket.
query_output_location = f's3://{bucket}/{prefix}/fs_query_results/'
fs_query.run(query_string=query_string, output_location=query_output_location)
fs_query.wait()
model_data = fs_query.as_dataframe()

INFO:sagemaker:Query 1c5c6cfb-e065-49ca-8aa8-5cbf60d59389 is being executed.
INFO:sagemaker:Query 1c5c6cfb-e065-49ca-8aa8-5cbf60d59389 successfully executed.


In [78]:
# Display the DataFrame returned by the Athena query
model_data

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month,write_time,api_invocation_time,is_deleted
0,2,4.685296,8.589093,0,1.0,0.0,0.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
1,4,4.239077,10.535598,0,0.0,0.0,0.0,1.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
2,2,3.569749,9.831642,0,0.0,0.0,1.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
3,3,4.908405,10.714162,0,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
4,1,4.239077,7.996590,0,0.0,0.0,1.0,0.0,1,2024-10-17 00:38:15.942,2024-10-17 00:33:19.000,False
...,...,...,...,...,...,...,...,...,...,...,...,...
692,5,4.685296,8.369228,1,0.0,0.0,0.0,1.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False
693,1,6.023952,8.490530,1,1.0,0.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False
694,1,3.792859,9.846547,1,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False
695,1,3.123531,9.182528,1,0.0,1.0,0.0,0.0,1,2024-10-17 00:38:15.888,2024-10-17 00:33:20.000,False


In [79]:
# Remove the columns that are not wanted as model inputs
# (write_time, api_invocation_time, is_deleted).
# errors='ignore' keeps this cell idempotent: it reassigns `model_data`, so re-running it
# after the columns are already gone would otherwise raise a KeyError.
model_data = model_data.drop(
    columns=['write_time', 'api_invocation_time', 'is_deleted'],
    errors='ignore',
)

In [80]:
# Display the DataFrame again to confirm which columns remain
model_data

Unnamed: 0,sensor_id,vehicle_count,avg_speed,incident,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,2,4.685296,8.589093,0,1.0,0.0,0.0,0.0,1
1,4,4.239077,10.535598,0,0.0,0.0,0.0,1.0,1
2,2,3.569749,9.831642,0,0.0,0.0,1.0,0.0,1
3,3,4.908405,10.714162,0,0.0,1.0,0.0,0.0,1
4,1,4.239077,7.996590,0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...
692,5,4.685296,8.369228,1,0.0,0.0,0.0,1.0,1
693,1,6.023952,8.490530,1,1.0,0.0,0.0,0.0,1
694,1,3.792859,9.846547,1,0.0,1.0,0.0,0.0,1
695,1,3.123531,9.182528,1,0.0,1.0,0.0,0.0,1


In [81]:
# Prepare the data for SageMaker's XGBoost container.
# The container expects CSV input in which the first column is the target variable
# and there is no header row.

# Column order for the training files: the target ('incident') first, then the features
xgb_column_order = [
    'incident',
    'sensor_id',
    'vehicle_count',
    'avg_speed',
    'weather_condition_fog',
    'weather_condition_rain',
    'weather_condition_clear',
    'weather_condition_snow',
    'timestamp_month',
]
model_data2 = model_data.reindex(columns=xgb_column_order)
model_data2

Unnamed: 0,incident,sensor_id,vehicle_count,avg_speed,weather_condition_fog,weather_condition_rain,weather_condition_clear,weather_condition_snow,timestamp_month
0,0,2,4.685296,8.589093,1.0,0.0,0.0,0.0,1
1,0,4,4.239077,10.535598,0.0,0.0,0.0,1.0,1
2,0,2,3.569749,9.831642,0.0,0.0,1.0,0.0,1
3,0,3,4.908405,10.714162,0.0,1.0,0.0,0.0,1
4,0,1,4.239077,7.996590,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...
692,1,5,4.685296,8.369228,0.0,0.0,0.0,1.0,1
693,1,1,6.023952,8.490530,1.0,0.0,0.0,0.0,1
694,1,1,3.792859,9.846547,0.0,1.0,0.0,0.0,1
695,1,1,3.123531,9.182528,0.0,1.0,0.0,0.0,1


In [82]:
# Replace the column names with positional integers (0 .. n-1) so no header names remain
model_data2.columns = range(len(model_data2.columns))

model_data2

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,2,4.685296,8.589093,1.0,0.0,0.0,0.0,1
1,0,4,4.239077,10.535598,0.0,0.0,0.0,1.0,1
2,0,2,3.569749,9.831642,0.0,0.0,1.0,0.0,1
3,0,3,4.908405,10.714162,0.0,1.0,0.0,0.0,1
4,0,1,4.239077,7.996590,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...
692,1,5,4.685296,8.369228,0.0,0.0,0.0,1.0,1
693,1,1,6.023952,8.490530,1.0,0.0,0.0,0.0,1
694,1,1,3.792859,9.846547,0.0,1.0,0.0,0.0,1
695,1,1,3.123531,9.182528,0.0,1.0,0.0,0.0,1


In [83]:
# Separate the data into train / validation / test splits.
#
# The model is trained on 70% of the rows, evaluated on 20% to estimate the accuracy we can
# expect on "new" data, and the remaining 10% is held back as a final test set for later use.

# Shuffle before slicing: a purely positional split is only representative if the row order
# is random. The previews above show incident=0 at the head and incident=1 at the tail, which
# suggests the rows may be grouped by label. A fixed seed keeps the split reproducible.
SPLIT_SEED = 42
shuffled_data = model_data2.sample(frac=1, random_state=SPLIT_SEED)

# Compute split indices (70% / 90% of the row count)
train_end = int(0.7 * len(shuffled_data))
val_end = int(0.9 * len(shuffled_data))

# Slice the shuffled DataFrame into train, validation, and test sets
train_data = shuffled_data.iloc[:train_end]
validation_data = shuffled_data.iloc[train_end:val_end]
test_data = shuffled_data.iloc[val_end:]

In [84]:
# Number of rows in the training split
len(train_data)

487

In [85]:
# Number of rows in the validation split
len(validation_data)

140

In [86]:
# Number of rows in the held-back test split
len(test_data)

70

In [87]:
# Write the train and validation splits to local CSV files, without index column or header row
for csv_name, split_frame in (('train.csv', train_data), ('validation.csv', validation_data)):
    split_frame.to_csv(csv_name, index=False, header=False)

In [88]:
# Copy the local CSV files to S3 so SageMaker can access them.
# S3 keys always use '/' as the separator, so they are built with plain string formatting
# instead of os.path.join (which uses the local OS separator, e.g. '\\' on Windows).
# One Bucket handle is created and reused for both uploads.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('{}/train/train.csv'.format(prefix)).upload_file('train.csv')
s3_bucket.Object('{}/validation/validation.csv'.format(prefix)).upload_file('validation.csv')

# Training

In [89]:
# Look up the ECR container location of Amazon SageMaker's XGBoost implementation
# for the region of the current boto3 session
current_region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=current_region,
    version='latest',
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [90]:
# Training uses the CSV file format, so each channel gets a TrainingInput that points at the
# files in S3 and declares the content type as CSV.
# Both URIs end with '/' so that the key prefix selects only the objects under that folder
# and cannot also match a sibling prefix that merely starts with the same text (e.g. 'train-old/').
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train/'.format(bucket, prefix), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/validation/'.format(bucket, prefix), content_type='csv')

First we'll need to specify training parameters to the estimator. This includes:

The xgboost algorithm container
The IAM role to use
Training instance type and count
S3 location for output data
Algorithm hyperparameters
And then a .fit() function which specifies:

S3 location for the input data. In this case we have both a training and a validation set, which are passed in as separate channels.

In [91]:
sess = sagemaker.Session()

# Estimator: which container to run, under which role, on what hardware,
# and where in S3 the training output is written
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sess,
)

# Algorithm hyperparameters, collected in one place and applied in a single call
xgb_hyperparameters = {
    # Maximum depth per tree. A deeper tree might increase the performance,
    # but also the complexity and the chance of overfitting.
    'max_depth': 5,
    # Learning rate
    'eta': 0.2,
    # Pseudo-regularisation parameter (Lagrangian multiplier) that depends on the other
    # parameters. The higher gamma is, the stronger the regularisation.
    'gamma': 4,
    'min_child_weight': 6,
    # Fraction of observations sampled for each tree.
    # Lower values prevent overfitting but might lead to under-fitting.
    'subsample': 0.8,
    'silent': 0,
    # RMSE is the default metric for linear regression; the SageMaker XGBoost algorithm
    # calculates RMSE on the data passed to the "validation" channel and writes it to the
    # CloudWatch logs.
    # NOTE(review): the 'incident' target shows only 0/1 values in the previews above -
    # confirm that a regression objective (rather than a binary-classification one) is intended.
    'objective': 'reg:linear',
    # Number of trees
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

# Start the training job with the train and validation channels
xgb.fit(inputs={'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker:Creating training-job with name: xgboost-2024-10-18-21-26-13-411


2024-10-18 21:26:15 Starting - Starting the training job...
2024-10-18 21:26:28 Starting - Preparing the instances for training...
2024-10-18 21:26:51 Downloading - Downloading input data...
2024-10-18 21:27:22 Downloading - Downloading the training image...
2024-10-18 21:28:09 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2024-10-18:21:28:21:INFO] Running standalone xgboost training.[0m
[34m[2024-10-18:21:28:21:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 8457.15mb[0m
[34m[2024-10-18:21:28:21:INFO] Determined delimiter of CSV input is ','[0m
[34m[21:28:21] S3DistributionType set as FullyReplicated[0m
[34m[21:28:21] 487x8 matrix with 3896 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-10-18:21:28:21:INFO] Determined delimiter of CSV input is ','[0m
[34m[21:28:21] S3DistributionType set as FullyReplicated[0m
[34m[21:28:21] 1